Label PRs when the json schema changes (#2240)

* label PRs when the json schema changes

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* moderate pr comments

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

* be more strict about processing file names

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>

---------

Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
This commit is contained in:
Alex Goodman 2023-10-20 13:00:15 -04:00 committed by GitHub
parent ef43294d0e
commit 8f6bdde666
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 354 additions and 1 deletions

224
.github/scripts/labeler.py vendored Normal file
View File

@ -0,0 +1,224 @@
from __future__ import annotations
import sys
import glob
import subprocess
import os
import re
# When True, run() only prints the commands it is given instead of executing them.
# main() turns this on when no PR number is available or when file lists are passed in for local testing.
DRY_RUN = False
def main(changed_files: str | None = None, merge_base_schema_files: str | None = None):
global DRY_RUN
pr_number = os.environ.get("GITHUB_PR_NUMBER")
comment_file_path = os.environ.get("CI_COMMENT_FILE")
if not comment_file_path:
print("CI_COMMENT_FILE not set")
sys.exit(1)
if not pr_number:
DRY_RUN = True
if changed_files:
DRY_RUN = True
# read lines from file... this is useful for local testing
with open(changed_files) as f:
pr_changed_files = f.read().splitlines()
with open(merge_base_schema_files) as f:
og_json_schema_files = sort_json_schema_files(f.read().splitlines())
else:
if not is_ci():
print("Not in CI")
sys.exit(1)
if not pr_number:
print("Not a PR")
sys.exit(1)
pr_changed_files = get_pr_changed_files(pr_number)
# since we are running this in the context of the pull_request_target, the checkout is the merge base..
# that is the main branch of the original repo, NOT the branch in the forked repo (or branch in the target
# repo for non-forked PRs). This means we just need to list the current checkedout files to get a sense of
# the changes before a merge.
og_json_schema_files = list_json_schema_files()
pr_json_schema_files = filter_to_schema_files(pr_changed_files)
# print("schema files in pr: ", summarize_schema_files(pr_json_schema_files))
# print("og schema files: ", summarize_schema_files(og_json_schema_files))
if not og_json_schema_files:
print("No schema files found in merge base")
sys.exit(1)
# pr_json_schema_files = set of PR files are added, removed, and changed files
new_schema_files = set(pr_json_schema_files) - set(og_json_schema_files)
removed_or_modified_schema_files = set(pr_json_schema_files) - set(new_schema_files)
print("new schemas: ", summarize_schema_files(new_schema_files))
print("removed or modified schemas:", summarize_schema_files(removed_or_modified_schema_files))
# if there is a new or modified schema, we should add the "json-schema" label to the PR...
if new_schema_files or removed_or_modified_schema_files:
print("\nAdding json-schema label...")
add_label(pr_number, "json-schema")
else:
remove_label(pr_number, "json-schema")
# new schema files should be scrutinized, comparing the latest and added versions to see if it's a breaking
# change (major version bump). Warn about it on the PR via adding a breaking-change label...
if is_breaking_change(new_schema_files, og_json_schema_files[-1]):
print("\nBreaking change detected...")
add_label(pr_number, "breaking-change")
else:
remove_label(pr_number, "breaking-change")
# modifying an existing schema could be a breaking change, we should warn about it on the PR via a comment...
# removing schema files should never be allowed, we should warn about it on the PR via a comment...
if removed_or_modified_schema_files:
print("\nRemoved or modified schema detected...")
schemas = sort_json_schema_files(list(removed_or_modified_schema_files))
schemas_str = "\n".join([f" - {schema}" for schema in schemas])
add_comment(comment_file_path, f"Detected modification or removal of existing json schemas:\n{schemas_str}", warning=True)
def add_comment(comment_file_path: str, comment: str, warning: bool = False, important: bool = False):
    """Write the PR comment body to a file and echo it to stdout.

    Args:
        comment_file_path: destination file; missing parent directories are created.
        comment: comment text (may span multiple lines).
        warning: render the comment as a "[!WARNING]" blockquote; takes precedence over important.
        important: render the comment as an "[!IMPORTANT]" blockquote.
    """
    if warning or important:
        # the alert marker only applies to a blockquote, so every line has to be quoted
        quoted = "\n".join(f"> {line}" for line in comment.splitlines())
        marker = "WARNING" if warning else "IMPORTANT"
        comment = f"> [!{marker}]\n{quoted}"

    # create any parent directories if they don't exist; a bare file name has an empty
    # dirname, and os.makedirs("") would raise FileNotFoundError
    parent_dir = os.path.dirname(comment_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(comment_file_path, "w", encoding="utf-8") as f:
        f.write(comment)

    print(f"Comment file contents: {comment_file_path}")
    print(comment)
def add_label(pr_number: str, label: str):
    """Add *label* to the given PR via the GitHub CLI; exit with status 1 on failure."""
    # run "gh pr edit --add-label <label>"
    # text=True so that stderr is reported as readable text rather than a bytes repr
    result = run(f"gh pr edit {pr_number} --add-label {label}", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Unable to add {label!r} label to PR {pr_number}")
        print(str(result.stderr))
        sys.exit(1)
def remove_label(pr_number: str, label: str):
    """Remove *label* from the given PR via the GitHub CLI; exit with status 1 on failure."""
    # run "gh pr edit --remove-label <label>"
    # text=True so that stderr is reported as readable text rather than a bytes repr
    result = run(f"gh pr edit {pr_number} --remove-label {label}", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
        print(f"Unable to remove {label!r} label from PR {pr_number}")
        print(str(result.stderr))
        sys.exit(1)
def major_version(semver: str) -> int:
    """Return the leading (major) component of a dotted version string as an int."""
    leading, _, _ = semver.partition(".")
    return int(leading)
def is_breaking_change(new_schema_files: set[str], latest_schema_file: str) -> bool:
    """Report whether any new schema file has a higher major version than the latest existing one."""
    baseline_major = major_version(get_semver(latest_schema_file))
    return any(
        major_version(get_semver(candidate)) > baseline_major
        for candidate in new_schema_files
    )
def summarize_schema_files(files: list[str]) -> list[str]:
    """Reduce each schema file path to just its version string, in iteration order."""
    return list(map(get_semver, files))
def is_ci() -> bool:
    """Report whether the CI environment variable is set (to any value, even an empty one)."""
    return os.environ.get("CI") is not None
def get_pr_changed_files(pr_number: str) -> list[str]:
    """Return the paths of all files changed in the PR, as reported by the GitHub CLI.

    Exits the process with status 1 when the CLI call fails.
    """
    command = f"gh pr view {pr_number} --json files --jq '.files.[].path'"
    completed = run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    if completed.returncode == 0:
        # one path per output line
        return completed.stdout.splitlines()

    print("Unable to get list of changed files in PR")
    print(str(completed.stderr))
    sys.exit(1)
def filter_to_schema_files(list_of_files: list[str]) -> list[str]:
    """Keep only paths of the exact form "schema/json/schema-X.Y.Z.json", sorted by version."""
    # be strict: nothing may precede or follow the expected file name
    schema_path = re.compile(r"^schema/json/schema-\d+\.\d+\.\d+\.json$")
    matching = [path for path in list_of_files if schema_path.match(path)]
    return sort_json_schema_files(matching)
def list_json_schema_files() -> list[str]:
    """List the "schema-*.json" files under "schema/json" (relative to the working directory), sorted by version."""
    discovered = glob.glob("schema/json/schema-*.json")
    return sort_json_schema_files(discovered)
def run(command: str, **kwargs) -> subprocess.CompletedProcess:
    """Run *command* with subprocess.run, or only print it when DRY_RUN is set.

    Args:
        command: the command to execute (or to echo in a dry run).
        **kwargs: passed through unchanged to subprocess.run.

    Returns:
        The subprocess.run result. In a dry run, a successful result (returncode 0)
        with empty stdout and stderr.
    """
    if DRY_RUN:
        print(f"[DRY RUN] {command}")
        # hand back empty output of the type the caller asked for, so code that reads
        # result.stdout / result.stderr keeps working without running anything
        text_mode = any(kwargs.get(option) for option in ("text", "universal_newlines", "encoding", "errors"))
        empty = "" if text_mode else b""
        return subprocess.CompletedProcess(args=[command], returncode=0, stdout=empty, stderr=empty)

    print(f"[RUN] {command}")
    return subprocess.run(command, **kwargs)
def get_semver(input_file: str) -> str:
    """Extract the version from a schema file path such as "schema/json/schema-1.12.1.json".

    Returns the text between the first "-" of the file name and the ".json" suffix
    ("1.12.1" in the example above).
    """
    # only inspect the file name, so that a "-" in a parent directory
    # (e.g. "my-fork/schema/json/schema-1.0.0.json") is not mistaken for the version separator
    file_name = os.path.basename(input_file)
    return file_name.split("-")[1].split(".json")[0]
def sort_json_schema_files(files: list[str]) -> list[str]:
    """Order schema files by their embedded version, compared numerically.

    Input entries look like "schema/json/schema-1.12.1.json". Comparing the version
    components as integers (rather than comparing the strings) places
    "schema/json/schema-1.2.1.json" before "schema/json/schema-1.12.1.json".
    Empty entries are dropped, and every result is rebuilt as "schema/json/schema-<version>.json".
    """
    def numeric_key(version: str) -> list[int]:
        return [int(part) for part in version.split('.')]

    ordered_versions = sorted((get_semver(path) for path in files if path), key=numeric_key)
    return [f"schema/json/schema-{version}.json" for version in ordered_versions]
# For local testing, a command-line argument may name a file that lists one path per line, for example:
# .binny.yaml
# .github/actions/bootstrap/action.yaml
# .github/scripts/goreleaser-install.sh
# .github/workflows/release.yaml
# .github/workflows/update-bootstrap-tools.yml
# .github/workflows/update-cpe-dictionary-index.yml
# .github/workflows/update-stereoscope-release.yml
# .github/workflows/validations.yaml
# .gitignore
# .goreleaser.yaml
# Makefile
# Taskfile.yaml
# schema/cyclonedx/Makefile
if __name__ == "__main__":
    # each optional argument is the name of a single file that contains a list of files (line separated);
    # they are only honored when both are given
    cli_args = sys.argv[1:]
    if len(cli_args) >= 2:
        changed_files, merge_base_schema_files = cli_args[0], cli_args[1]
    else:
        changed_files = merge_base_schema_files = None

    main(changed_files, merge_base_schema_files)

65
.github/scripts/labeler_test.py vendored Normal file
View File

@ -0,0 +1,65 @@
import unittest
from unittest.mock import patch
import subprocess
import labeler
class Labeler(unittest.TestCase):
    """Unit tests for the helper functions in labeler.py."""

    def test_major_version(self):
        self.assertEqual(labeler.major_version("1.2.3"), 1)
        self.assertEqual(labeler.major_version("2.0.0"), 2)

    def test_is_breaking_change(self):
        # a major version bump relative to the latest schema is breaking
        new_schema_files = {"schema/json/schema-2.0.0.json"}
        latest_schema_file = "schema/json/schema-1.2.0.json"
        self.assertTrue(labeler.is_breaking_change(new_schema_files, latest_schema_file))

        # a minor version bump is not
        new_schema_files = {"schema/json/schema-1.3.0.json"}
        latest_schema_file = "schema/json/schema-1.2.0.json"
        self.assertFalse(labeler.is_breaking_change(new_schema_files, latest_schema_file))

    def test_summarize_schema_files(self):
        files = ["schema/json/schema-1.0.0.json", "schema/json/schema-2.0.0.json"]
        expected = ["1.0.0", "2.0.0"]
        self.assertEqual(labeler.summarize_schema_files(files), expected)

    def test_is_ci(self):
        # Mock os.environ to simulate CI environment
        with patch.dict("os.environ", {"CI": "true"}):
            self.assertTrue(labeler.is_ci())

        # ...and an environment without the CI variable
        with patch.dict("os.environ", {}, clear=True):
            self.assertFalse(labeler.is_ci())

    def test_get_pr_changed_files(self):
        expected_command = "gh pr view 123 --json files --jq '.files.[].path'"
        expected_output = "file1.json\nfile2.json\n"

        # use a dedicated result instance: setting attributes on the subprocess.CompletedProcess
        # class itself would leak into every other user of that class in this process
        completed = subprocess.CompletedProcess(args=[expected_command], returncode=0, stdout=expected_output, stderr="")

        with patch("labeler.run", return_value=completed) as mock_run:
            result = labeler.get_pr_changed_files("123")
            mock_run.assert_called_with(expected_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            self.assertEqual(result, ["file1.json", "file2.json"])

    def test_filter_to_schema_files(self):
        input_files = ["schema/json/schema-1.0.0.json", "not_schema.txt", "schema/json/schema-2.0.0.json"]
        expected_files = ["schema/json/schema-1.0.0.json", "schema/json/schema-2.0.0.json"]
        self.assertEqual(labeler.filter_to_schema_files(input_files), expected_files)

        # we should be strict about what files are allowed to be processed
        input_files = ["schema/json/schema-1.0.0extracontent.json", "schema/json/schema-1.0.0.md", "schema/json/schema-1.0.0.json.extracontent"]
        expected_files = []
        self.assertEqual(labeler.filter_to_schema_files(input_files), expected_files)

    def test_get_semver(self):
        input_file = "schema/json/schema-1.0.0.json"
        expected_semver = "1.0.0"
        self.assertEqual(labeler.get_semver(input_file), expected_semver)

    def test_sort_json_schema_files(self):
        # versions must be ordered numerically, not as strings
        files = ["schema/json/schema-1.12.1.json", "schema/json/schema-1.2.1.json"]
        expected_sorted_files = ["schema/json/schema-1.2.1.json", "schema/json/schema-1.12.1.json"]
        self.assertEqual(labeler.sort_json_schema_files(files), expected_sorted_files)
# run the test suite when this file is executed directly as a script
if __name__ == "__main__":
    unittest.main()

54
.github/workflows/labeler.yaml vendored Normal file
View File

@ -0,0 +1,54 @@
name: "Detect schema changes"
on:
# IMPORTANT! This workflow is triggered by the `pull_request_target` event
# which means that forked PRs will run with access to secrets from the repo
# it's forked from (the "target" repo).
#
# For this reason we must NEVER check out the code from the pull request
# (e.g. "ref: ${{ github.event.pull_request.head.sha }}") to prevent
# accidentally running potentially untrusted code.
#
# By default the checkout will be:
# - GITHUB_SHA: Last commit on the PR base branch
# - GITHUB_REF: PR base branch
#
# ...unlike a typical PR where:
# - GITHUB_SHA: Last merge commit on the GITHUB_REF branch
# - GITHUB_REF: PR merge branch refs/pull/:prNumber/merge
pull_request_target:
env:
# note: this is used within hashFiles() so must be within the GITHUB_WORKSPACE path (or will silently fail)
CI_COMMENT_FILE: .tmp/labeler-comment.txt
# any string that uniquely identifies the comment on a PR across multiple runs
COMMENT_HEADER: "label-commentary"
jobs:
label:
name: "Label changes"
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 #v4.1.1
- run: python .github/scripts/labeler.py
env:
# note: this token has write access to the repo
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_PR_NUMBER: ${{ github.event.number }}
- name: Delete existing comment
if: ${{ hashFiles( env.CI_COMMENT_FILE ) == '' }}
uses: marocchino/sticky-pull-request-comment@efaaab3fd41a9c3de579aba759d2552635e590fd #v2.8.0
with:
header: ${{ env.COMMENT_HEADER }}
hide: true
hide_classify: "OUTDATED"
- name: Add comment
if: ${{ hashFiles( env.CI_COMMENT_FILE ) != '' }}
uses: marocchino/sticky-pull-request-comment@efaaab3fd41a9c3de579aba759d2552635e590fd #v2.8.0
with:
header: ${{ env.COMMENT_HEADER }}
path: ${{ env.CI_COMMENT_FILE }}

5
.gitignore vendored
View File

@ -64,3 +64,8 @@ test/integration/test-fixtures/**/go.sum
# attestation # attestation
cosign.key cosign.key
cosign.pub cosign.pub
# Byte-compiled object files for python
__pycache__/
*.py[cod]
*$py.class

View File

@ -6,6 +6,7 @@ In order to test and develop in this repo you will need the following dependenci
- Golang - Golang
- docker - docker
- make - make
- Python (>= 3.9)
### Docker settings for getting started ### Docker settings for getting started
Make sure you've updated your docker settings so the default docker socket path is available. Make sure you've updated your docker settings so the default docker socket path is available.

View File

@ -70,7 +70,7 @@ all: static-analysis test ## Run all linux-based checks (linting, license check,
static-analysis: check-go-mod-tidy lint check-licenses check-json-schema-drift ## Run all static analysis checks static-analysis: check-go-mod-tidy lint check-licenses check-json-schema-drift ## Run all static analysis checks
.PHONY: test .PHONY: test
test: unit integration validate-cyclonedx-schema benchmark cli ## Run all tests (currently unit, integration, linux compare, and cli tests) test: unit integration validate-cyclonedx-schema benchmark test-utils cli ## Run all tests (currently unit, integration, linux compare, and cli tests)
## Bootstrapping targets ################################# ## Bootstrapping targets #################################
@ -167,6 +167,10 @@ cli: $(SNAPSHOT_DIR) ## Run CLI tests
SYFT_BINARY_LOCATION='$(SNAPSHOT_BIN)' \ SYFT_BINARY_LOCATION='$(SNAPSHOT_BIN)' \
go test -count=1 -timeout=15m -v ./test/cli go test -count=1 -timeout=15m -v ./test/cli
.PHONY: test-utils
test-utils:
python .github/scripts/labeler_test.py
## Benchmark test targets ################################# ## Benchmark test targets #################################