From fa40b09311f95c76d4cce6e2df218a95769ed059 Mon Sep 17 00:00:00 2001 From: Jason Rhinelander Date: Fri, 5 Apr 2024 16:15:08 -0300 Subject: [PATCH] Use keep-alive for simulators This adds keep-alive scripts to help deal with cleaning up xcode simulator devices started by CI jobs. Other changes: - Change /Users/drone -> /Users/$USER in case we use some other CI username in the future. - Simplify the xcode simulator interaction a bit by using xcode's json output + jq to query/manipulate it. --- .drone.jsonnet | 71 ++++++++++++++++++------------- Scripts/ci-host/README.md | 59 ++++++++++++++++++++++++++ Scripts/ci-host/cleanup.py | 33 +++++++++++++++ Scripts/ci-host/keepalive.sh | 82 ++++++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 30 deletions(-) create mode 100644 Scripts/ci-host/README.md create mode 100755 Scripts/ci-host/cleanup.py create mode 100755 Scripts/ci-host/keepalive.sh diff --git a/.drone.jsonnet b/.drone.jsonnet index bd06b45c3..3a99dd630 100644 --- a/.drone.jsonnet +++ b/.drone.jsonnet @@ -41,22 +41,22 @@ local load_cocoapods_cache = { commands: [ ||| LOOP_BREAK=0 - while test -e /Users/drone/.cocoapods_cache.lock; do + while test -e /Users/$USER/.cocoapods_cache.lock; do sleep 1 LOOP_BREAK=$((LOOP_BREAK + 1)) if [[ $LOOP_BREAK -ge 600 ]]; then - rm -f /Users/drone/.cocoapods_cache.lock + rm -f /Users/$USER/.cocoapods_cache.lock fi done |||, - 'touch /Users/drone/.cocoapods_cache.lock', + 'touch /Users/$USER/.cocoapods_cache.lock', ||| - if [[ -d /Users/drone/.cocoapods_cache ]]; then - cp -r /Users/drone/.cocoapods_cache ./Pods + if [[ -d /Users/$USER/.cocoapods_cache ]]; then + cp -r /Users/$USER/.cocoapods_cache ./Pods fi |||, - 'rm -f /Users/drone/.cocoapods_cache.lock' + 'rm -f /Users/$USER/.cocoapods_cache.lock', ], depends_on: [ 'Clone Submodules' @@ -69,26 +69,50 @@ local update_cocoapods_cache(depends_on) = { commands: [ ||| LOOP_BREAK=0 - while test -e /Users/drone/.cocoapods_cache.lock; do + while test -e 
/Users/$USER/.cocoapods_cache.lock; do sleep 1 LOOP_BREAK=$((LOOP_BREAK + 1)) if [[ $LOOP_BREAK -ge 600 ]]; then - rm -f /Users/drone/.cocoapods_cache.lock + rm -f /Users/$USER/.cocoapods_cache.lock fi done |||, - 'touch /Users/drone/.cocoapods_cache.lock', + 'touch /Users/$USER/.cocoapods_cache.lock', ||| if [[ -d ./Pods ]]; then - rsync -a --delete ./Pods/ /Users/drone/.cocoapods_cache + rsync -a --delete ./Pods/ /Users/$USER/.cocoapods_cache fi |||, - 'rm -f /Users/drone/.cocoapods_cache.lock' + 'rm -f /Users/$USER/.cocoapods_cache.lock', ], depends_on: depends_on, }; +local boot_simulator(device_type) = { + name: 'Boot Test Simulator', + commands: [ + 'devname="Test-iPhone14-${DRONE_COMMIT:0:9}-${DRONE_BUILD_EVENT}"', + 'xcrun simctl create "$devname" ' + device_type, + 'sim_uuid=$(xcrun simctl list devices -je | jq -re \'[.devices[][] | select(.name == "\'$devname\'").udid][0]\')', + 'xcrun simctl boot $sim_uuid', + + 'mkdir -p build/artifacts', + 'echo $sim_uuid > ./build/artifacts/sim_uuid', + 'echo $devname > ./build/artifacts/device_name', + + 'xcrun simctl list -je devices $sim_uuid | jq -r \'.devices[][0] | "\\u001b[32;1mSimulator " + .state + ": \\u001b[34m" + .name + " (\\u001b[35m" + .deviceTypeIdentifier + ", \\u001b[36m" + .udid + "\\u001b[34m)\\u001b[0m"\'', + ], +}; +local sim_keepalive = { + name: '(Simulator keep-alive)', + commands: [ + '/Users/$USER/sim-keepalive/keepalive.sh $(<./build/artifacts/sim_uuid)', + ], + depends_on: ['Boot Test Simulator'], +}; +local sim_delete_cmd = 'if [ -f build/artifacts/sim_uuid ]; then rm -f /Users/$USER/sim-keepalive/$(<./build/artifacts/sim_uuid); fi'; + [ // Unit tests (PRs only) { @@ -102,36 +126,23 @@ local update_cocoapods_cache(depends_on) = { clone_submodules, load_cocoapods_cache, install_cocoapods, - { - name: 'Clean Up Old Test Simulators', - commands: [ - './Scripts/clean-up-old-test-simulators.sh' - ] - }, - { - name: 'Pre-Boot Test Simulator', - commands: [ - 'mkdir -p build/artifacts', - 'echo 
"Test-iPhone14-${DRONE_COMMIT:0:9}-${DRONE_BUILD_EVENT}" > ./build/artifacts/device_name', - 'xcrun simctl create "$(<./build/artifacts/device_name)" com.apple.CoreSimulator.SimDeviceType.iPhone-14', - 'echo $(xcrun simctl list devices | grep -m 1 $(<./build/artifacts/device_name) | grep -E -o -i "([0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12})") > ./build/artifacts/sim_uuid', - 'xcrun simctl boot $(<./build/artifacts/sim_uuid)', - 'echo "Pre-booting simulator complete: $(xcrun simctl list | sed "s/^[[:space:]]*//" | grep -o ".*$(<./build/artifacts/sim_uuid).*")"', - ] - }, + + boot_simulator('com.apple.CoreSimulator.SimDeviceType.iPhone-14'), + sim_keepalive, { name: 'Build and Run Tests', commands: [ 'NSUnbufferedIO=YES set -o pipefail && xcodebuild test -workspace Session.xcworkspace -scheme Session -derivedDataPath ./build/derivedData -resultBundlePath ./build/artifacts/testResults.xcresult -parallelizeTargets -destination "platform=iOS Simulator,id=$(<./build/artifacts/sim_uuid)" -parallel-testing-enabled NO -test-timeouts-enabled YES -maximum-test-execution-time-allowance 10 -collect-test-diagnostics never 2>&1 | xcbeautify --is-ci', ], depends_on: [ - 'Pre-Boot Test Simulator', - 'Install CocoaPods' + 'Boot Test Simulator', + 'Install CocoaPods', ], }, { name: 'Unit Test Summary', commands: [ + sim_delete_cmd, ||| if [[ -d ./build/artifacts/testResults.xcresult ]]; then xcresultparser --output-format cli --failed-tests-only ./build/artifacts/testResults.xcresult diff --git a/Scripts/ci-host/README.md b/Scripts/ci-host/README.md new file mode 100644 index 000000000..44b07ab3c --- /dev/null +++ b/Scripts/ci-host/README.md @@ -0,0 +1,59 @@ +# Xcode simulator keepalive/cleanup + +Keep-alive directory for simulators managed by xcode that may be created during +drone CI jobs. 
+
+These scripts are placed in a /Users/$USER/sim-keepalive directory; keepalive.sh
+is called from a CI job to set up a keepalive, while cleanup.py is intended to
+be run once per minute via cron to clean up old simulators left behind by killed
+CI pipelines.
+
+The directory itself will contain files named after simulator UDIDs that are
+checked periodically (by cleanup.py); any whose timestamp is in the past are
+deleted along with the simulator they name.
+
+## Simple timeout
+
+A CI job can invoke the keepalive.sh script with a UDID value and a time
+interval: the keepalive script will set up a file that keeps the simulator
+alive for the given interval and then exits immediately. Shortly (up to a
+minute) after the timestamp is reached, cleanup.py removes the file and
+deletes the simulator device (if it still exists).
+For example:
+
+    /Users/$USER/sim-keepalive/keepalive.sh $udid "5 minutes"
+
+for a fixed 5-minute cleanup timeout.
+
+## Indefinite timeout
+
+For a job where the precise time required isn't known or varies significantly,
+keepalive.sh can instead be run without a time interval: it then acts as a
+keep-alive loop that creates and periodically updates the $udid file to keep
+the simulator alive.
+
+This is moderately more complex to set up as you must add a parallel job (using
+`depends_on`) to the CI pipeline that runs the script for the duration of the
+steps that require the simulator:
+
+    /Users/$USER/sim-keepalive/keepalive.sh $udid
+
+The script periodically touches the sim-keepalive/$udid file to keep the
+simulator alive as long as the keep-alive script runs. To stop the keepalive
+(i.e. when the task is done) simply run:
+
+    rm /Users/$USER/sim-keepalive/$udid
+
+which will cause the keepalive script to immediately shut down and delete the
+simulator with the given UDID and then exit.
+
+If the pipeline gets killed, the keepalive script stops updating the file and
+the simulator will be killed by the periodic cleanup within the next couple
+of minutes.
+
+# crontab entry
+
+A crontab entry must be added to the CI user's crontab to periodically run
+cleanup.py:
+
+    * * * * * ~/sim-keepalive/cleanup.py
diff --git a/Scripts/ci-host/cleanup.py b/Scripts/ci-host/cleanup.py
new file mode 100755
index 000000000..64c8e779f
--- /dev/null
+++ b/Scripts/ci-host/cleanup.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# Cron job: delete simulators whose keep-alive file has expired or that look stale.
+import os
+import subprocess
+import shutil
+import time
+import json
+# Keep-alive files are named after the UDID and live next to this script.
+os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
+subprocess.run(["xcrun", "simctl", "delete", "unavailable"], check=True)
+
+simctl_list = json.loads(subprocess.run(["xcrun", "simctl", "list", "devices", "-je"], check=True, stdout=subprocess.PIPE).stdout)
+
+now = time.time()
+# Walk every device of every runtime and decide whether it should be deleted.
+for rt, devs in simctl_list.get("devices", {}).items():
+    for dev in devs:
+        udid = dev["udid"]
+        nuke_it = False
+        if os.path.isfile(udid):
+            if os.path.getmtime(udid) <= now:
+                nuke_it = True
+                os.remove(udid)
+            # else the keepalive file is still active
+        elif os.path.getmtime(dev["dataPath"]) <= now - 3600:
+            # no keep-alive and more than an hour old so kill it
+            nuke_it = True
+
+        if nuke_it:
+            subprocess.run(["xcrun", "simctl", "delete", udid])
+            if os.path.exists(dev["logPath"]):
+                shutil.rmtree(dev["logPath"])
diff --git a/Scripts/ci-host/keepalive.sh b/Scripts/ci-host/keepalive.sh
new file mode 100755
index 000000000..324b35fa0
--- /dev/null
+++ b/Scripts/ci-host/keepalive.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+# Usage: keepalive.sh UDID [INTERVAL] -- with INTERVAL: one-shot timer; without: keep-alive loop.
+set -e
+
+if ! [[ "$1" =~ ^[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}$ ]]; then
+    echo "Error: expected single UDID argument. Usage: $0 XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" >&2
+    exit 1
+fi
+
+UDID=$1
+
+cd "$(gdirname "$(greadlink -f "$0")")"  # quoted so an install path containing spaces still works
+
+reset="\e[0m"
+red="\e[31;1m"
+green="\e[32;1m"
+yellow="\e[33;1m"
+blue="\e[34;1m"
+cyan="\e[36;1m"
+# One-shot mode: stamp the file with the expiry time and exit; cleanup.py acts on it later.
+if [ -n "$2" ]; then
+    gtouch --date "$2" $UDID
+
+    echo -e "\n${green}Started a $2 one-shot cleanup timer for device $cyan$UDID${reset}"
+
+    exit 0
+fi
+
+echo -e "\n${green}Starting keep-alive for device $cyan$UDID${reset}"
+
+gtouch --date '30 seconds' $UDID
+last_print=0
+last_touch=$EPOCHSECONDS
+started=$EPOCHSECONDS
+# Print the simulator's current state, or "not found" when simctl no longer lists it.
+function print_state() {
+    if ! xcrun simctl list -je devices $UDID |
+        jq -er '.devices[][] | "Current state: \u001b[32;1m" + .state + " \u001b[34m(" + .name + ", \u001b[35m" + .deviceTypeIdentifier + ", \u001b[36m" + .udid + "\u001b[34m)\u001b[0m"'; then
+        echo -e "Current state: $cyan$UDID ${red}not found$reset"
+    fi
+}
+
+while true; do
+    if [[ $EPOCHSECONDS -gt $((last_touch + 10)) ]]; then
+        last_touch=$EPOCHSECONDS
+        gtouch --no-create --date '30 seconds' $UDID  # --no-create: never resurrect a removed file
+    fi
+
+    if [ ! -f $UDID ]; then
+        echo -e "$cyan$UDID ${yellow}keep-alive file vanished${reset}"
+        if xcrun simctl list -je devices $UDID | jq -e "any(.devices[][]; .)" >/dev/null; then
+            logdir="$(xcrun simctl list devices -je $UDID | jq -r 'first(.devices[][].logPath // "null")')"  # -r: no JSON quotes
+            echo -e "$blue ... shutting down device${reset}"
+            xcrun simctl shutdown $UDID || true  # a failed shutdown must not abort (set -e) before the delete
+            print_state
+            echo -e "$blue ... deleting device${reset}"
+            xcrun simctl delete $UDID
+            print_state
+            if [ "$logdir" != "null" ] && [ -d "$logdir" ]; then
+                echo -e "$blue ... deleting log directory $logdir${reset}"
+                rm -rf "$logdir"
+            fi
+        else
+            echo -e "\n${yellow}Device ${cyan}$UDID${yellow} no longer exists!${reset}"
+        fi
+
+        echo -e "\n${green}All done.${reset}"
+        exit 0
+    fi
+
+    if [[ $EPOCHSECONDS -gt $((last_print + 30)) ]]; then
+        last_print=$EPOCHSECONDS
+        print_state
+    fi
+
+    if [[ $EPOCHSECONDS -gt $((started + 7200)) ]]; then
+        echo -e "${red}2-hour timeout reached; exiting to allow cleanup${reset}"
+        exit 1  # stop refreshing the file so cleanup.py can reap the simulator
+    fi
+
+    sleep 0.5
+done