Error Handling and Resilience
Learning Objectives
- Master pipeline error handling techniques
- Implement robust error recovery mechanisms
- Build resilient pipeline architectures
- Develop effective monitoring and alerting strategies
Understanding Pipeline Failures
Common Pipeline Failure Points
- Build Failures
  - Compilation errors
  - Missing dependencies
  - Resource constraints
- Test Failures
  - Failed unit tests
  - Integration test issues
  - Performance test thresholds
- Deployment Issues
  - Environment configuration
  - Network problems
  - Service dependencies
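Each of these failure points surfaces as a different build outcome, and a declarative post section can react to each outcome separately. A minimal sketch (the Maven commands stand in for any build and test steps):

```groovy
pipeline {
    agent any
    stages {
        stage('Build') {
            steps {
                // Compilation errors or missing dependencies fail the build outright
                sh 'mvn clean package'
            }
        }
        stage('Test') {
            steps {
                // Mark test problems as UNSTABLE instead of failing the whole build
                catchError(buildResult: 'UNSTABLE', stageResult: 'UNSTABLE') {
                    sh 'mvn test'
                }
            }
        }
    }
    post {
        failure  { echo 'Build or deployment failure: check compilation, dependencies, environment' }
        unstable { echo 'Test failures or threshold violations' }
        aborted  { echo 'Pipeline aborted: often a timeout or a manual stop' }
    }
}
```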
Error Handling Techniques
1. Basic Error Handling
```groovy
pipeline {
    agent any
    stages {
        stage('Build') {
            steps {
                script {
                    try {
                        sh 'mvn clean package'
                    } catch (Exception e) {
                        echo "Build failed: ${e.message}"
                        currentBuild.result = 'FAILURE'
                        error "Build stage failed"
                    }
                }
            }
        }
    }
}
```
2. Advanced Error Recovery
```groovy
pipeline {
    agent any
    environment {
        MAX_RETRIES = '3'
        RETRY_DELAY = '60' // seconds
    }
    stages {
        stage('Deploy with Retry') {
            steps {
                script {
                    // Environment values are strings, so convert them before comparing
                    int maxRetries = MAX_RETRIES.toInteger()
                    int retryDelay = RETRY_DELAY.toInteger()
                    def retryCount = 0
                    def deployed = false
                    while (!deployed && retryCount < maxRetries) {
                        try {
                            deploy()
                            deployed = true
                        } catch (Exception e) {
                            retryCount++
                            if (retryCount < maxRetries) {
                                echo "Deployment failed, attempt ${retryCount} of ${maxRetries}"
                                sleep(retryDelay)
                            } else {
                                error "Deployment failed after ${maxRetries} attempts"
                            }
                        }
                    }
                }
            }
        }
    }
}

def deploy() {
    // Deployment logic here
    sh './deploy.sh'
}
```
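For straightforward cases, the same pattern can be expressed with Jenkins' built-in retry step instead of a hand-rolled loop; a minimal sketch, reusing the placeholder deploy.sh from above:

```groovy
pipeline {
    agent any
    stages {
        stage('Deploy with Retry') {
            steps {
                // retry re-runs the enclosed block up to 3 times before failing the stage
                retry(3) {
                    sh './deploy.sh'
                }
            }
        }
    }
}
```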
3. Parallel Error Handling
```groovy
pipeline {
    agent any
    stages {
        stage('Parallel Tests') {
            steps {
                script {
                    def testResults = [:]
                    parallel(
                        'Unit Tests': {
                            try {
                                sh 'mvn test'
                                testResults['unit'] = 'SUCCESS'
                            } catch (Exception e) {
                                // Record the failure instead of rethrowing so the other
                                // branch and the analysis below still run
                                testResults['unit'] = 'FAILURE'
                            }
                        },
                        'Integration Tests': {
                            try {
                                sh 'mvn integration-test'
                                testResults['integration'] = 'SUCCESS'
                            } catch (Exception e) {
                                testResults['integration'] = 'FAILURE'
                            }
                        },
                        failFast: false
                    )
                    // Analyse the aggregated test results
                    def failedTests = testResults.findAll { it.value == 'FAILURE' }
                    if (failedTests) {
                        error "Tests failed: ${failedTests.keySet().join(', ')}"
                    }
                }
            }
        }
    }
}
```
Building Resilient Pipelines
1. Timeout Handling
```groovy
pipeline {
    agent any
    options {
        timeout(time: 1, unit: 'HOURS')
    }
    stages {
        stage('Long Running Process') {
            steps {
                timeout(time: 30, unit: 'MINUTES') {
                    script {
                        try {
                            sh './long-running-process.sh'
                        } catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e) {
                            // The timeout step aborts its body with a FlowInterruptedException
                            echo "Process exceeded timeout"
                            // Cleanup or recovery actions
                            error "Process timed out"
                        }
                    }
                }
            }
        }
    }
}
```
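The timeout step also accepts an activity flag that resets the countdown whenever the step produces log output, which often suits long-running processes better than a fixed limit. A minimal sketch, reusing the same placeholder script:

```groovy
pipeline {
    agent any
    stages {
        stage('Long Running Process') {
            steps {
                // Abort only if the step produces no log output for 10 minutes
                timeout(time: 10, unit: 'MINUTES', activity: true) {
                    sh './long-running-process.sh'
                }
            }
        }
    }
}
```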
2. Resource Management
```groovy
pipeline {
    agent any
    environment {
        MAX_MEMORY = '4G'
        MAX_CPU = '2'
    }
    stages {
        stage('Resource-Intensive Task') {
            steps {
                script {
                    try {
                        sh """
                            docker run \
                                --memory=${MAX_MEMORY} \
                                --cpus=${MAX_CPU} \
                                my-app:latest
                        """
                    } catch (Exception e) {
                        // e.message may be null, so use the safe-navigation operator
                        if (e.message?.contains('insufficient memory')) {
                            echo "Insufficient resources, attempting cleanup"
                            sh 'docker system prune -f'
                            error "Resource allocation failed"
                        } else {
                            throw e
                        }
                    }
                }
            }
        }
    }
}
```
3. Network Resilience
```groovy
pipeline {
    agent any
    environment {
        CURL_RETRY = '5'
        CURL_RETRY_DELAY = '10'
        CURL_TIMEOUT = '30'
    }
    stages {
        stage('API Integration') {
            steps {
                script {
                    // -o /dev/null discards the body so only the status code is captured
                    def response = sh(
                        script: """
                            curl \
                                --retry ${CURL_RETRY} \
                                --retry-delay ${CURL_RETRY_DELAY} \
                                --max-time ${CURL_TIMEOUT} \
                                -s -o /dev/null -w '%{http_code}' \
                                https://api.example.com/endpoint
                        """,
                        returnStdout: true
                    ).trim()
                    if (response != '200') {
                        error "API request failed with status ${response}"
                    }
                }
            }
        }
    }
}
```
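If the HTTP Request plugin is installed, the same check can be made without shelling out to curl. A rough sketch, assuming that plugin and the same placeholder endpoint:

```groovy
pipeline {
    agent any
    stages {
        stage('API Integration') {
            steps {
                script {
                    // httpRequest fails the build unless the response code matches validResponseCodes
                    def response = httpRequest(
                        url: 'https://api.example.com/endpoint',
                        timeout: 30,
                        validResponseCodes: '200'
                    )
                    echo "API responded with status ${response.status}"
                }
            }
        }
    }
}
```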
Monitoring and Alerting
1. Pipeline Metrics
```groovy
pipeline {
    agent any
    environment {
        // Environment values are stored as strings, so convert back when doing arithmetic
        BUILD_START_TIME = "${System.currentTimeMillis()}"
    }
    stages {
        stage('Build and Test') {
            steps {
                script {
                    def stageStartTime = System.currentTimeMillis()
                    try {
                        sh 'mvn clean package'
                    } finally {
                        def duration = System.currentTimeMillis() - stageStartTime
                        recordMetric('build_duration', duration)
                    }
                }
            }
        }
    }
    post {
        always {
            script {
                def totalDuration = System.currentTimeMillis() - BUILD_START_TIME.toLong()
                recordMetric('total_build_duration', totalDuration)
                // Send metrics to monitoring system
                sendMetricsToPrometheus()
            }
        }
    }
}

def recordMetric(String name, Object value) {
    echo "METRIC ${name}=${value}"
    // Implementation to store metric
}

def sendMetricsToPrometheus() {
    // Implementation to send metrics to Prometheus
}
```
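Jenkins also exposes build timing directly on currentBuild, so a manual start-time variable is not strictly necessary; a minimal sketch reading it in a post block:

```groovy
pipeline {
    agent any
    stages {
        stage('Build and Test') {
            steps {
                sh 'mvn clean package'
            }
        }
    }
    post {
        always {
            script {
                // currentBuild exposes the build start time in milliseconds
                def elapsed = System.currentTimeMillis() - currentBuild.startTimeInMillis
                echo "METRIC total_build_duration=${elapsed}"
            }
        }
    }
}
```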
2. Alert Integration
```groovy
pipeline {
    agent any
    environment {
        SLACK_CHANNEL = '#deployments'
        PAGERDUTY_SERVICE = 'PROD-PIPELINE'
    }
    stages {
        stage('Critical Operation') {
            steps {
                script {
                    try {
                        performCriticalOperation()
                    } catch (Exception e) {
                        notifyFailure(e)
                        throw e
                    }
                }
            }
        }
    }
    post {
        failure {
            script {
                // Slack notification
                slackSend(
                    channel: SLACK_CHANNEL,
                    color: 'danger',
                    message: "Pipeline failed: ${currentBuild.fullDisplayName}"
                )
                // PagerDuty incident (parameter names depend on the PagerDuty plugin version)
                pagerduty(
                    serviceKey: PAGERDUTY_SERVICE,
                    incidentKey: "${env.JOB_NAME}-${env.BUILD_NUMBER}",
                    description: "Pipeline failure in ${env.JOB_NAME}",
                    details: "Build URL: ${env.BUILD_URL}"
                )
            }
        }
    }
}

def performCriticalOperation() {
    // Critical operation implementation
}

def notifyFailure(Exception e) {
    def errorDetails = [
        job: env.JOB_NAME,
        build: env.BUILD_NUMBER,
        error: e.message,
        stage: env.STAGE_NAME,
        url: env.BUILD_URL
    ]
    // Send detailed error notification
    emailext(
        subject: "Pipeline Failure: ${env.JOB_NAME}",
        body: generateErrorReport(errorDetails),
        to: 'team@example.com'
    )
}

def generateErrorReport(Map details) {
    // Generate a detailed HTML error report
    return """
        <h2>Pipeline Failure Report</h2>
        <p>Job: ${details.job}</p>
        <p>Build: ${details.build}</p>
        <p>Stage: ${details.stage}</p>
        <p>Error: ${details.error}</p>
        <p><a href="${details.url}">Build Details</a></p>
    """
}
```
Best Practices
1. Error Prevention
- Implement input validation
- Use environment checks
- Verify dependencies
- Test error scenarios
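A minimal sketch of these prevention practices: a first stage that validates parameters and verifies required tools before any expensive work runs (the parameter name and tool list are illustrative):

```groovy
pipeline {
    agent any
    parameters {
        string(name: 'TARGET_ENV', defaultValue: '', description: 'Deployment target')
    }
    stages {
        stage('Validate') {
            steps {
                script {
                    // Input validation: fail fast on a missing or unexpected parameter
                    if (!(params.TARGET_ENV in ['dev', 'staging', 'prod'])) {
                        error "Invalid TARGET_ENV: '${params.TARGET_ENV}'"
                    }
                    // Dependency checks: verify required tools exist on the agent
                    ['mvn', 'docker'].each { cmd ->
                        if (sh(script: "command -v ${cmd}", returnStatus: true) != 0) {
                            error "Required tool '${cmd}' is not available on this agent"
                        }
                    }
                }
            }
        }
    }
}
```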
2. Error Recovery
- Implement graceful degradation
- Use circuit breakers
- Implement fallback mechanisms
- Clean up resources
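A rough illustration of fallback and cleanup behaviour (the three shell scripts are placeholders):

```groovy
pipeline {
    agent any
    stages {
        stage('Deploy') {
            steps {
                script {
                    try {
                        // Primary deployment path
                        sh './deploy-blue-green.sh'
                    } catch (Exception e) {
                        // Graceful degradation: fall back to a simpler, known-good strategy
                        echo "Primary deployment failed (${e.message}), falling back"
                        sh './deploy-rolling.sh'
                    } finally {
                        // Always release temporary resources, even on failure
                        sh './cleanup-temp-resources.sh'
                    }
                }
            }
        }
    }
}
```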
3. Monitoring
- Track error rates
- Monitor performance metrics
- Set up alerts
- Maintain audit logs
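A small sketch of these monitoring practices: emit the final result on every run so an external system can derive error rates, and alert on failures and recoveries (the Slack channel is a placeholder):

```groovy
pipeline {
    agent any
    stages {
        stage('Build') {
            steps {
                sh 'mvn clean package'
            }
        }
    }
    post {
        always {
            // Machine-readable result line for error-rate dashboards and audit logs
            echo "METRIC build_result{job='${env.JOB_NAME}'} ${currentBuild.currentResult}"
        }
        failure {
            slackSend(channel: '#builds', color: 'danger',
                      message: "${env.JOB_NAME} #${env.BUILD_NUMBER} failed: ${env.BUILD_URL}")
        }
        fixed {
            slackSend(channel: '#builds', color: 'good',
                      message: "${env.JOB_NAME} is back to normal")
        }
    }
}
```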
Exercises
Exercise 1: Implement Resilient Deployment Pipeline
Create a pipeline that implements various resilience patterns:
```groovy
pipeline {
    agent any
    environment {
        APP_NAME = 'resilient-app'
        MAX_RETRIES = '3'
        HEALTH_CHECK_RETRIES = '5'
        HEALTH_CHECK_DELAY = '10'
    }
    stages {
        stage('Deploy') {
            steps {
                script {
                    // Environment values are strings, so convert them before comparing
                    int maxRetries = MAX_RETRIES.toInteger()
                    def deployed = false
                    def attempts = 0
                    while (!deployed && attempts < maxRetries) {
                        try {
                            // Deploy application
                            deploy()
                            // Verify deployment
                            if (verifyDeployment()) {
                                deployed = true
                            } else {
                                error "Deployment verification failed"
                            }
                        } catch (Exception e) {
                            attempts++
                            if (attempts < maxRetries) {
                                echo "Deployment attempt ${attempts} failed, retrying..."
                                sleep(time: 30, unit: 'SECONDS')
                            } else {
                                error "Deployment failed after ${maxRetries} attempts"
                            }
                        }
                    }
                }
            }
        }
        stage('Health Check') {
            steps {
                script {
                    int healthRetries = HEALTH_CHECK_RETRIES.toInteger()
                    int healthDelay = HEALTH_CHECK_DELAY.toInteger()
                    def healthy = false
                    def checks = 0
                    while (!healthy && checks < healthRetries) {
                        try {
                            // -o /dev/null discards the body so only the status code is captured
                            def status = sh(
                                script: "curl -s -o /dev/null -w '%{http_code}' http://localhost:8080/health",
                                returnStdout: true
                            ).trim()
                            if (status == '200') {
                                healthy = true
                                echo "Application is healthy"
                            } else {
                                error "Health check failed with status ${status}"
                            }
                        } catch (Exception e) {
                            checks++
                            if (checks < healthRetries) {
                                echo "Health check attempt ${checks} failed, retrying..."
                                sleep(time: healthDelay, unit: 'SECONDS')
                            } else {
                                error "Health check failed after ${healthRetries} attempts"
                            }
                        }
                    }
                }
            }
        }
    }
    post {
        failure {
            script {
                // Rollback on failure
                try {
                    rollback()
                } catch (Exception e) {
                    echo "Rollback failed: ${e.message}"
                }
                // Notify team
                notifyFailure()
            }
        }
        success {
            script {
                notifySuccess()
            }
        }
    }
}
def deploy() {
    // Deployment implementation
    sh './deploy.sh'
}

def verifyDeployment() {
    // Deployment verification logic
    def status = sh(
        script: './verify-deployment.sh',
        returnStatus: true
    )
    return status == 0
}

def rollback() {
    // Rollback implementation
    sh './rollback.sh'
}

def notifyFailure() {
    // Failure notification implementation
    emailext(
        subject: "[${env.APP_NAME}] Deployment Failed",
        body: "Deployment of ${env.APP_NAME} failed. Please check the build logs.",
        to: 'team@example.com'
    )
}

def notifySuccess() {
    // Success notification implementation
    slackSend(
        channel: '#deployments',
        color: 'good',
        message: "Deployment of ${env.APP_NAME} completed successfully: ${env.BUILD_URL}"
    )
}
```