mirror of
https://github.com/nocodb/nocodb.git
synced 2026-05-01 12:27:00 +00:00
290 lines
13 KiB
Bash
Executable File
290 lines
13 KiB
Bash
Executable File
#!/bin/bash
|
|
# Caution: this script is used for production deployments.
#
# Forces ECS to relaunch the tasks/instances. When relaunched, a new Docker
# image is pulled, which rolls out new software/config or simply restarts
# the services.
#
|
|
# Absolute path of the directory containing this script; used to locate
# sibling helper scripts such as image_promote.sh.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
|
# Escapes a string so it can be embedded inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab.
# Arguments: $1 - raw text
# Outputs:   the escaped text on stdout (without surrounding quotes)
function _json_escape(){
  local raw=${1-} escaped="" ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch=${raw:i:1}
    case "${ch}" in
      '\') escaped+='\\' ;;
      '"') escaped+='\"' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      *) escaped+="${ch}" ;;
    esac
  done
  printf '%s' "${escaped}"
}

# Echoes a message locally and posts it to the Slack deploy channel.
# Globals:   SLACK_WEBHOOK_URL (read, optional) - overrides the default webhook
# Arguments: all arguments are joined with spaces into one message
# Outputs:   the message, followed by curl's response body, on stdout
# Returns:   exit status of curl
function message(){
  # "$*" (not "$@") so several arguments become ONE string; "$@" inside a
  # quoted string expands to several words and corrupts the curl command line.
  local text="$*"
  # NOTE(review): the default webhook is a secret committed to the repository;
  # it should be rotated and supplied through SLACK_WEBHOOK_URL instead.
  local webhook_url="${SLACK_WEBHOOK_URL:-https://hooks.slack.com/services/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}"

  printf '%s\n' "${text}"
  curl -s -X POST -H "Content-type: application/json" \
    --data "{\"text\":\"$(_json_escape "${text}")\"}" \
    "${webhook_url}"
}
|
|
|
|
# Reports a deployment failure and terminates the script.
# Globals:   ENVIRONMENT (read) - used as the message prefix
# Arguments: optional failure details; all arguments are joined with spaces
# Outputs:   the details on stderr (when given); a failure notice via message
# Returns:   never returns; exits the shell with status 1
function log_and_exit(){
  # Join into a single string: "${@}" inside a quoted string would expand to
  # several words and hand message() a split-up argument list.
  local details="$*"

  if [[ -n "${details}" ]]; then
    printf '%s\n' "${details}" >&2
  fi
  message "${ENVIRONMENT}: deployment failed. Check logs for details. ${details}"
  exit 1
}
|
|
|
|
# Forces a new deployment of every ECS service listed in ALL_SVS.
# Globals:   CLUSTER (read)      - ECS cluster name, required
#            ALL_SVS (read)      - whitespace-separated service names, required
#            EXCLUDED_SVC (read) - optional whitespace-separated services to skip
#            DEPLOY_OUT (written)- output of the last update-service call
# Outputs:   one progress line per service on stdout
# Returns:   calls log_and_exit (exit 1) when required variables are missing
#            or when any update-service call fails
function update_workspace(){
  if [[ -z "${CLUSTER}" || -z "${ALL_SVS}" ]]; then
    echo "CLUSTER and ALL_SVS variables must be set for update_workspace"
    log_and_exit
  fi

  local svc
  local failed_services=""

  # ALL_SVS is a whitespace-separated list; word splitting is intentional.
  for svc in ${ALL_SVS}; do
    # Pad the exclusion list so the first and last entries match as well
    # (the list is not required to start or end with a space).
    if [[ " ${EXCLUDED_SVC} " == *" ${svc} "* ]]; then
      echo "skip updating service : ${svc}"
      continue
    fi

    if DEPLOY_OUT=$(aws ecs update-service --cluster "${CLUSTER}" --service "${svc}" --force-new-deployment --region us-east-2); then
      echo "updated service : ${svc}"
    else
      echo "failed to update service : ${svc}" >&2
      failed_services+=" ${svc}"
    fi
  done

  # Do not let a rollout that never started be reported as successful later.
  if [[ -n "${failed_services}" ]]; then
    log_and_exit "failed to update services:${failed_services}"
  fi
}
|
|
|
|
# Checks the rollout status of every ECS service listed in ALL_SVS.
# Globals:   CLUSTER (read)      - ECS cluster name, required
#            ALL_SVS (read)      - whitespace-separated service names, required
#            EXCLUDED_SVC (read) - optional whitespace-separated services to skip
# Outputs:   progress lines on stdout; per-service results via checkStatus
# Returns:   calls log_and_exit (exit 1) when required variables are missing
function check_status_all_workspaces(){
  # Validate before sleeping so a misconfiguration fails immediately.
  if [[ -z "${CLUSTER}" || -z "${ALL_SVS}" ]]; then
    echo "CLUSTER and ALL_SVS variables must be set for check status"
    log_and_exit
  fi

  # Start after an initial sleep to avoid racing the freshly triggered deployment.
  echo "Adding initial delay of 30 sec to check status "
  sleep 30

  local svc
  # ALL_SVS is a whitespace-separated list; word splitting is intentional.
  for svc in ${ALL_SVS}; do
    # Pad the exclusion list so the first and last entries match as well.
    if [[ " ${EXCLUDED_SVC} " == *" ${svc} "* ]]; then
      echo "skip status check for service : ${svc}"
    else
      checkStatus "${svc}"
    fi
  done
}
|
|
|
|
# Prints the rolloutState of every deployment of one ECS service, one per line.
# Arguments: $1 - service name
function _ecs_rollout_states(){
  aws ecs describe-services --cluster "${CLUSTER}" --service "${1}" --region us-east-2 \
    | jq -r '.services[].deployments[].rolloutState'
}

# Polls one ECS service until none of its deployments is IN_PROGRESS, then
# reports the outcome via message.
# Globals:   CLUSTER (read)             - ECS cluster name, required
#            ENVIRONMENT (read)         - message prefix
#            global_retry_count (r/w)   - retry budget (20) shared by all
#                                         services checked in one rollout
# Arguments: $1 - service name
# Returns:   non-zero when the rollout is still in progress, failed, or its
#            state could not be read; otherwise the status of message
function checkStatus(){
  local service=${1}
  if [[ -z "${CLUSTER}" || -z "${service}" ]]; then
    echo "CLUSTER and service variable must be set for check status"
    log_and_exit
  fi

  : "${global_retry_count:=0}"

  # Declared separately so `local` does not mask the pipeline's exit status.
  local STATUS
  STATUS=$(_ecs_rollout_states "${service}")
  echo "First Check: ECS deployment status: ${STATUS} for ${service}.Retry count: ${global_retry_count}"

  while [[ ${global_retry_count} -lt 20 && "${STATUS}" == *"IN_PROGRESS"* ]]; do
    STATUS=$(_ecs_rollout_states "${service}")
    global_retry_count=$((global_retry_count+1))
    echo "ECS deployment status: ${STATUS} for ${service}. Retry after 30 seconds. Retry count: ${global_retry_count}"
    if [[ "${STATUS}" == *"IN_PROGRESS"* ]]; then
      sleep 30
    fi
  done

  local console_url="https://us-east-2.console.aws.amazon.com/ecs/v2/clusters/${CLUSTER}/services/${service}/tasks?region=us-east-2"

  if [[ "${STATUS}" == *"IN_PROGRESS"* ]]; then
    message "${ENVIRONMENT}: deployment status is IN_PROGRESS after waiting for about 10 mins for workspace ${service}. check status at ${console_url} "
    return 1
  elif [[ -z "${STATUS}" ]]; then
    # An empty result means the query itself failed; do not claim success.
    message "${ENVIRONMENT}: could not determine deployment status for workspace ${service}. check status at ${console_url} "
    return 1
  elif [[ "${STATUS}" == *"FAILED"* ]]; then
    message "${ENVIRONMENT}: deployment FAILED for workspace ${service}. check status at ${console_url} "
    return 1
  else
    message "${ENVIRONMENT}: deployment completed successfully for workspace : ${service}"
  fi
}
|
|
|
|
# Pre-warms an Auto Scaling group before a rollout:
#   - reads the current desired capacity,
#   - doubles it,
#   - polls once per second, up to 10 times, for the in-service instance
#     count to reach the new capacity.
# Globals:   ASG_NAME (read)    - Auto Scaling group name, required
#            ENVIRONMENT (read) - message prefix
#            prev_count, new_count, timeout, current_count (written) - left
#            global as in the original, in case callers read them afterwards
# Returns:   0 when the capacity change was requested (even if the short wait
#            ran out); 1 when the current capacity could not be read or the
#            capacity change was rejected
function prewarm_asg(){
  if [[ -z "${ASG_NAME}" ]]; then
    echo "ASG_NAME variables must be set for pre-warming"
    log_and_exit
  fi

  # Get the current desired count
  prev_count=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "${ASG_NAME}" --region us-east-2 --query 'AutoScalingGroups[0].DesiredCapacity' --output text)

  # A failed or empty lookup must never reach the arithmetic below: a
  # non-numeric word evaluates to 0 and would scale the group down to zero.
  if [[ ! "${prev_count}" =~ ^[0-9]+$ ]]; then
    message "${ENVIRONMENT}: prewarming skipped. could not read desired capacity of ${ASG_NAME} (got: ${prev_count:-nothing})"
    return 1
  fi

  # Double the current count
  new_count=$((prev_count * 2))

  echo "${ENVIRONMENT}: prewarming initiating. previous_count: ${prev_count} new_count: ${new_count} "

  # Update the desired count to be double
  if ! aws autoscaling set-desired-capacity --auto-scaling-group-name "${ASG_NAME}" --region us-east-2 --desired-capacity "${new_count}"; then
    message "${ENVIRONMENT}: prewarming failed. could not set desired capacity of ${ASG_NAME} to ${new_count}"
    return 1
  fi

  # Wait for the new instances to launch with doubled count
  timeout=10
  current_count=0
  while (( timeout > 0 )); do
    current_count=$(aws autoscaling describe-auto-scaling-groups --auto-scaling-group-names "${ASG_NAME}" --region us-east-2 --query 'AutoScalingGroups[0].Instances[?LifecycleState==`InService`].InstanceId' --output text | wc -w)

    if (( current_count >= new_count )); then
      message "${ENVIRONMENT}: prewarming completed successfully. previous_count: ${prev_count} new_count: ${new_count} "
      return 0
    fi

    sleep 1
    timeout=$((timeout - 1))
  done

  # The capacity change was accepted but the instances are not in service yet;
  # say so instead of claiming success.
  message "${ENVIRONMENT}: prewarming requested but not yet complete. previous_count: ${prev_count} new_count: ${new_count} in_service: $((current_count))"
  return 0
}
|
|
|
|
# Waits until enough NEW tasks (tasks not in the protected list) of a service
# are in RUNNING state.
# Globals:   CLUSTER (read)     - ECS cluster name
#            ENVIRONMENT (read) - message prefix
# Arguments: $1 - service name
#            $2 - whitespace-separated task ARNs that count as "old"
#            $3 - minimum number of new running tasks (default: 1)
#            $4 - maximum number of attempts (default: 30)
#            $5 - seconds to sleep between attempts (default: 30)
# Outputs:   progress on stderr; a warning via message on timeout
# Returns:   0 once the minimum is met, 1 on timeout
function wait_for_new_tasks(){
  local service_name=${1}
  local protected_tasks=${2}
  local min_new_tasks=${3:-1}
  local max_retries=${4:-30}
  local retry_interval=${5:-30}
  local retry_count=0
  local listed described task running_count
  local -a current_tasks new_tasks running_tasks

  echo "Waiting for ${min_new_tasks} new tasks to start (different from protected tasks)..." >&2

  while (( retry_count < max_retries )); do
    current_tasks=()
    new_tasks=()
    running_tasks=()

    # Get all current tasks for the service. Errors are deliberately silenced:
    # a failed call is simply treated as "nothing found yet" and retried.
    if listed=$(aws ecs list-tasks --cluster "${CLUSTER}" --service-name "${service_name}" --region us-east-2 --query 'taskArns[]' --output text 2>/dev/null); then
      # read returns non-zero at end of input even when it filled the array.
      read -r -d '' -a current_tasks <<<"${listed}" || true
    fi

    # Keep only tasks that are not part of the protected (old) set.
    for task in "${current_tasks[@]}"; do
      if [[ "${protected_tasks}" != *"${task}"* ]]; then
        new_tasks+=("${task}")
      fi
    done

    if (( ${#new_tasks[@]} > 0 )); then
      # Check which of the new tasks are in RUNNING state
      if described=$(aws ecs describe-tasks --cluster "${CLUSTER}" --tasks "${new_tasks[@]}" --region us-east-2 --query 'tasks[?lastStatus==`RUNNING`].taskArn' --output text 2>/dev/null); then
        read -r -d '' -a running_tasks <<<"${described}" || true
      fi

      running_count=${#running_tasks[@]}
      if (( running_count > 0 )); then
        echo "Found ${running_count} new running tasks: ${running_tasks[*]}" >&2

        if (( running_count >= min_new_tasks )); then
          echo "Minimum task requirement met (${running_count} >= ${min_new_tasks})" >&2
          return 0
        fi
        echo "Waiting for more tasks... (${running_count}/${min_new_tasks})" >&2
      fi
    fi

    retry_count=$((retry_count + 1))
    echo "Waiting for new tasks to be running... Retry ${retry_count}/${max_retries}" >&2
    # Sleeping after the final attempt would only delay the timeout report.
    if (( retry_count < max_retries )); then
      sleep "${retry_interval}"
    fi
  done

  echo "Warning: Timeout waiting for new worker tasks to start. Proceeding anyway..." >&2
  message "${ENVIRONMENT}: Warning - Timeout waiting for new worker tasks for ${service_name}"
  return 1
}
|
|
|
|
# POSTs {"workerGroupId": "<id>"} to an internal workers endpoint.
# --fail makes HTTP error responses (4xx/5xx) produce a non-zero exit status;
# without it curl reports success and the callers' error handlers never run.
# Arguments: $1 - endpoint name under /internal/workers/
#            $2 - worker group ID
function _post_worker_group(){
  local endpoint=${1}
  local group_id=${2}
  curl --fail --silent --show-error -u "${API_CREDENTIALS}" \
    "${HOST_NAME}/internal/workers/${endpoint}" -XPOST \
    -H "Content-Type: application/json" \
    -d "{\"workerGroupId\":\"${group_id}\"}"
}

# Rolls out the worker service without dropping running workers:
#   1. protects the current tasks and tags them with an "old" group ID,
#   2. forces a new deployment,
#   3. waits for as many new tasks as there were old ones,
#   4. assigns a fresh group ID to the new workers,
#   5. waits briefly for the ID to propagate,
#   6. asks the application to stop all other worker groups.
# Globals:   CLUSTER, WORKERS_SERVICE_NAME, HOST_NAME, API_CREDENTIALS,
#            ENVIRONMENT (read)
#            WORKER_GROUP_ID, OLD_WORKER_GROUP_ID, CURRENT_TASKS,
#            PROTECTED_TASKS, PROTECTED_TASK_COUNT, NEW_TASKS_STARTED (written;
#            left global as in the original)
# Returns:   0 on success or when skipped because configuration is missing;
#            1 when any step fails or the new tasks were never confirmed
function zero_downtime_worker_deployment(){
  local creds_state="Empty"
  if [[ -n "${API_CREDENTIALS}" ]]; then
    creds_state="***value-set***"
  fi
  echo "zero_downtime_worker_deployment: Expected variables to be set CLUSTER=${CLUSTER} WORKERS_SERVICE_NAME=${WORKERS_SERVICE_NAME} HOST_NAME=${HOST_NAME} API_CREDENTIALS=${creds_state}"

  if [[ -z "${CLUSTER}" || -z "${WORKERS_SERVICE_NAME}" || -z "${HOST_NAME}" || -z "${API_CREDENTIALS}" ]]; then
    echo "CLUSTER, WORKERS_SERVICE_NAME, HOST_NAME and API_CREDENTIALS must all be set, skipping worker deployment"
    return 0
  fi

  # Generate a unique worker group ID for new workers
  WORKER_GROUP_ID="deploy-$(date +%s)-$(openssl rand -hex 4)"

  message "${ENVIRONMENT}: Starting zero-downtime worker deployment with group ID: ${WORKER_GROUP_ID}"

  # 1. Get current worker tasks and enable task protection
  echo "Getting current worker tasks for service: ${WORKERS_SERVICE_NAME}"
  CURRENT_TASKS=$(aws ecs list-tasks --cluster "${CLUSTER}" --service-name "${WORKERS_SERVICE_NAME}" --region us-east-2 --query 'taskArns[]' --output text 2>/dev/null)

  local -a current_tasks=()
  # read returns non-zero at end of input even when it filled the array.
  read -r -d '' -a current_tasks <<<"${CURRENT_TASKS}" || true

  if (( ${#current_tasks[@]} > 0 )); then
    echo "Found current worker tasks: ${CURRENT_TASKS}"

    # Enable task protection to prevent termination during force deployment.
    # The API accepts at most 10 tasks per call, so protect them in batches.
    echo "Enabling task protection for current worker tasks"
    local protection_ok=true
    local offset
    for (( offset = 0; offset < ${#current_tasks[@]}; offset += 10 )); do
      aws ecs update-task-protection \
        --cluster "${CLUSTER}" \
        --tasks "${current_tasks[@]:offset:10}" \
        --protection-enabled \
        --expires-in-minutes 300 \
        --region us-east-2 || protection_ok=false
    done

    if [[ "${protection_ok}" == "true" ]]; then
      message "${ENVIRONMENT}: Current workers are running with task protection enabled, proceeding with zero-downtime deployment"
    else
      echo "Warning: Failed to enable task protection. Proceeding without protection."
      message "${ENVIRONMENT}: Warning - could not enable task protection for all current workers, proceeding with deployment"
    fi
    PROTECTED_TASKS="${CURRENT_TASKS}"

    # Assign worker group ID to old workers (if restarted it will be gone)
    OLD_WORKER_GROUP_ID="deploy-$(date +%s)-$(openssl rand -hex 4)"
    echo "Assigning worker group ID to old workers: ${OLD_WORKER_GROUP_ID}"
    if ! _post_worker_group assign-worker-group "${OLD_WORKER_GROUP_ID}"; then
      message "${ENVIRONMENT}: Failed to assign worker group ID"
      return 1
    fi
  else
    echo "No current tasks found for worker service"
    message "${ENVIRONMENT}: No current workers found, proceeding with standard deployment"
    PROTECTED_TASKS=""
  fi

  # 2. Trigger force deployment for workers
  echo "Triggering force deployment for worker service: ${WORKERS_SERVICE_NAME}"
  if ! aws ecs update-service --cluster "${CLUSTER}" --service "${WORKERS_SERVICE_NAME}" --force-new-deployment --region us-east-2; then
    # Without a new deployment there is nothing to wait for.
    message "${ENVIRONMENT}: Failed to trigger force deployment for worker service ${WORKERS_SERVICE_NAME}"
    return 1
  fi

  # 3. Wait for new instances to come up and be healthy (different from protected tasks)
  echo "Waiting for new worker instances to be running..."

  # Wait for as many new tasks as were protected, to keep the same capacity.
  if (( ${#current_tasks[@]} > 0 )); then
    PROTECTED_TASK_COUNT=${#current_tasks[@]}
    echo "Protected tasks count: ${PROTECTED_TASK_COUNT}. Waiting for same number of new tasks..."
  else
    PROTECTED_TASK_COUNT=1
    echo "No protected tasks found. Waiting for at least 1 new task..."
  fi

  wait_for_new_tasks "${WORKERS_SERVICE_NAME}" "${CURRENT_TASKS}" "${PROTECTED_TASK_COUNT}"
  NEW_TASKS_STARTED=$?

  # 4. Add worker group ID to new workers
  echo "Assigning worker group ID to new workers: ${WORKER_GROUP_ID}"
  if ! _post_worker_group assign-worker-group "${WORKER_GROUP_ID}"; then
    message "${ENVIRONMENT}: Failed to assign worker group ID"
    return 1
  fi

  # 5. Wait for id to propagate to new workers
  echo "Waiting 10 seconds for new workers to be fully ready..."
  sleep 10

  # 6. Stop other worker groups (old workers) - only if new tasks started
  if (( NEW_TASKS_STARTED != 0 )); then
    message "${ENVIRONMENT}: Zero-downtime worker deployment incomplete: new workers were not confirmed running, old worker groups were left untouched (group ID: ${WORKER_GROUP_ID})"
    return 1
  fi

  echo "Stopping old worker groups (preserving group: ${WORKER_GROUP_ID})"
  if ! _post_worker_group stop-other-worker-groups "${WORKER_GROUP_ID}"; then
    message "${ENVIRONMENT}: Failed to stop other worker groups"
    return 1
  fi

  message "${ENVIRONMENT}: Zero-downtime worker deployment completed successfully with group ID: ${WORKER_GROUP_ID}"
  return 0
}
|
|
|
|
# Runs a full rollout: optional image promotion, forced redeployment of every
# service in the cluster, status checks, then the worker deployment.
# Globals:   ENVIRONMENT, CLUSTER (read, required)
#            SCRIPT_DIR, ECR_REPO_NAME, PRE_REL_STAGE_TAG, STAGE_TAG,
#            REPO_NAME (read)
#            PROMOTE_IMAGE_BEFORE_ROLLOUT, global_retry_count,
#            latest_remote_digest, ALL_SVS (written)
# Arguments: $1 - "true" to promote the pre-release image first (default: false);
#                 only honoured for the Production and Prod-SQL-Executors
#                 environments
# Returns:   1 when the worker deployment reports failure; exits via
#            log_and_exit when required variables are missing or the image
#            promotion fails
function perform_rollout(){
  PROMOTE_IMAGE_BEFORE_ROLLOUT=${1:-false}

  if [[ -z "${ENVIRONMENT}" || -z "${CLUSTER}" ]]; then
    echo "CLUSTER and ENVIRONMENT variables must be set for check status"
    log_and_exit
  fi

  # Retry budget shared by every checkStatus call of this rollout.
  global_retry_count=0

  echo "${ENVIRONMENT}: deployment started."

  if [[ "${PROMOTE_IMAGE_BEFORE_ROLLOUT}" == "true" && ( "${ENVIRONMENT}" == "Production" || "${ENVIRONMENT}" == "Prod-SQL-Executors" ) ]]; then
    message "${ENVIRONMENT}: promoting ws-pre-release to ws before rollout."
    # Abort on failure: continuing would roll out the previous image while
    # reporting a successful deployment.
    if ! "${SCRIPT_DIR}/image_promote.sh" "${ECR_REPO_NAME}" "${PRE_REL_STAGE_TAG}" "${STAGE_TAG}"; then
      log_and_exit "image promotion failed"
    fi
  fi

  # The query is quoted so the shell never treats its brackets as a glob.
  latest_remote_digest=$(aws ecr batch-get-image --region us-east-2 --repository-name "${REPO_NAME:-nocohub}" --image-ids "imageTag=${STAGE_TAG}" --output text --query 'images[].imageId')
  message "${ENVIRONMENT}: Image with tag:${STAGE_TAG} will be launched. digest: ${latest_remote_digest}"

  # TODO: prewarm ASG to have additional instances. update only desired
  # The service name is the last path segment of the ARN; taking the last
  # segment works no matter how many segments precede it.
  ALL_SVS=$(aws ecs list-services --cluster "${CLUSTER}" --region us-east-2 | jq -r '.serviceArns[] | split("/") | .[-1]')
  update_workspace
  check_status_all_workspaces
  message "${ENVIRONMENT}: deployment executed successfully."

  if zero_downtime_worker_deployment; then
    message "${ENVIRONMENT}: zero downtime worker deployment executed successfully."
  else
    message "${ENVIRONMENT}: zero downtime worker deployment did not complete successfully. Check logs for details."
    return 1
  fi
}