Error Handling and Resilience

Learning Objectives

  • Master pipeline error handling techniques
  • Implement robust error recovery mechanisms
  • Build resilient pipeline architectures
  • Develop effective monitoring and alerting strategies

Understanding Pipeline Failures

Common Pipeline Failure Points

  1. Build Failures
    • Compilation errors
    • Missing dependencies
    • Resource constraints
  2. Test Failures
    • Failed unit tests
    • Integration test issues
    • Performance test thresholds
  3. Deployment Issues
    • Environment configuration
    • Network problems
    • Service dependencies

Error Handling Techniques

1. Basic Error Handling

// Basic error handling: wrap the build command in try/catch so the failure
// is logged with context before the build is aborted.
pipeline {
    agent any
    stages {
        stage('Build') {
            steps {
                script {
                    try {
                        sh 'mvn clean package'
                    } catch (Exception e) {
                        // Log the underlying cause, mark the build failed,
                        // then abort the stage with a clear message.
                        echo "Build failed: ${e.message}"
                        currentBuild.result = 'FAILURE'
                        error "Build stage failed"
                    }
                }
            }
        }
    }
}

2. Advanced Error Recovery

// Advanced error recovery: retry a deployment a bounded number of times,
// pausing between attempts, and fail the build once the budget is spent.
pipeline {
    agent any
    environment {
        // Environment values are always strings; they are converted to
        // integers below before being used in comparisons.
        MAX_RETRIES = '3'
        RETRY_DELAY = '60' // seconds
    }
    stages {
        stage('Deploy with Retry') {
            steps {
                script {
                    def maxRetries = env.MAX_RETRIES.toInteger()
                    def retryDelay = env.RETRY_DELAY.toInteger()
                    def retryCount = 0
                    def deployed = false
                    while (!deployed && retryCount < maxRetries) {
                        try {
                            deploy()
                            deployed = true
                        } catch (Exception e) {
                            // Count the failed attempt so the loop terminates.
                            retryCount++
                            if (retryCount < maxRetries) {
                                echo "Deployment failed, attempt ${retryCount} of ${maxRetries}"
                                sleep(time: retryDelay, unit: 'SECONDS')
                            } else {
                                error "Deployment failed after ${maxRetries} attempts"
                            }
                        }
                    }
                }
            }
        }
    }
}

// Runs the actual deployment; throws if the script exits non-zero.
def deploy() {
    // Deployment logic here
    // NOTE(review): script name is a placeholder - point it at the real deploy script.
    sh './deploy.sh'
}

3. Parallel Error Handling

// Parallel error handling: run both test suites side by side, record the
// outcome of each branch, and report every failed suite in one message.
pipeline {
    agent any
    stages {
        stage('Parallel Tests') {
            steps {
                script {
                    def testResults = [:]
                    try {
                        parallel(
                            'Unit Tests': {
                                try {
                                    sh 'mvn test'
                                    testResults['unit'] = 'SUCCESS'
                                } catch (Exception e) {
                                    testResults['unit'] = 'FAILURE'
                                    throw e
                                }
                            },
                            'Integration Tests': {
                                try {
                                    sh 'mvn integration-test'
                                    testResults['integration'] = 'SUCCESS'
                                } catch (Exception e) {
                                    testResults['integration'] = 'FAILURE'
                                    throw e
                                }
                            },
                            // Let the other branch finish even if one fails.
                            failFast: false
                        )
                    } finally {
                        // Analysis of test results. This lives in `finally`
                        // because `parallel` rethrows a branch failure, which
                        // would otherwise skip the summary below.
                        def failedTests = testResults.findAll { it.value == 'FAILURE' }
                        if (failedTests) {
                            error "Tests failed: ${failedTests.keySet().join(', ')}"
                        }
                    }
                }
            }
        }
    }
}

Building Resilient Pipelines

1. Timeout Handling

// Timeout handling: a global one-hour budget for the whole pipeline plus a
// tighter 30-minute budget around a single long-running step.
pipeline {
    agent any
    options {
        timeout(time: 1, unit: 'HOURS')
    }
    stages {
        stage('Long Running Process') {
            steps {
                script {
                    try {
                        timeout(time: 30, unit: 'MINUTES') {
                            // NOTE(review): script name is a placeholder.
                            sh './long-running-process.sh'
                        }
                    } catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e) {
                        // ExceededTimeout is a cause of interruption, not an
                        // exception, so it cannot be named in a catch clause;
                        // inspect the interruption causes instead.
                        // NOTE(review): may need script approval when sandboxed.
                        def timedOut = e.causes.any {
                            it instanceof org.jenkinsci.plugins.workflow.steps.TimeoutStepExecution.ExceededTimeout
                        }
                        if (!timedOut) {
                            // Manual abort or other interruption: do not mask it.
                            throw e
                        }
                        echo "Process exceeded timeout"
                        // Cleanup or recovery actions
                        error "Process timed out"
                    }
                }
            }
        }
    }
}

2. Resource Management

// Resource management: cap the memory and CPU of a containerised task and
// clean up Docker resources when the task is killed for exceeding its memory.
pipeline {
    agent any
    environment {
        MAX_MEMORY = '4G'
        MAX_CPU = '2'
        // NOTE(review): placeholder image name - replace with the real task image.
        TASK_IMAGE = 'resource-intensive-task:latest'
    }
    stages {
        stage('Resource-Intensive Task') {
            steps {
                script {
                    try {
                        sh """
                            docker run \
                                --memory=${MAX_MEMORY} \
                                --cpus=${MAX_CPU} \
                                ${TASK_IMAGE}
                        """
                    } catch (Exception e) {
                        // A failed `sh` step only reports the exit code, never
                        // the command's stderr. 137 (128 + SIGKILL) is what
                        // Docker returns when the container is OOM-killed.
                        if (e.message?.contains('exit code 137')) {
                            echo "Insufficient resources, attempting cleanup"
                            sh 'docker system prune -f'
                            error "Resource allocation failed"
                        } else {
                            throw e
                        }
                    }
                }
            }
        }
    }
}

3. Network Resilience

// Network resilience: let curl retry transient failures with a delay and an
// overall time limit, then fail the stage on any non-200 response.
pipeline {
    agent any
    environment {
        CURL_RETRY = '5'
        CURL_RETRY_DELAY = '10'
        CURL_TIMEOUT = '30'
        // NOTE(review): placeholder endpoint - replace with the real API URL.
        API_URL = 'https://api.example.com/status'
    }
    stages {
        stage('API Integration') {
            steps {
                script {
                    // -o /dev/null discards the response body so stdout holds
                    // only the status code written by -w; trim() removes any
                    // surrounding whitespace before the comparison.
                    def response = sh(
                        script: """
                            curl \
                                --retry ${CURL_RETRY} \
                                --retry-delay ${CURL_RETRY_DELAY} \
                                --max-time ${CURL_TIMEOUT} \
                                -s -o /dev/null -w '%{http_code}' \
                                '${API_URL}'
                        """,
                        returnStdout: true
                    ).trim()
                    if (response != '200') {
                        error "API request failed with status ${response}"
                    }
                }
            }
        }
    }
}

Monitoring and Alerting

1. Pipeline Metrics

// Pipeline metrics: time the build stage and the whole run, and hand the
// numbers to a metrics backend when the pipeline finishes.
pipeline {
    agent any
    stages {
        stage('Build and Test') {
            steps {
                script {
                    def stageStartTime = System.currentTimeMillis()
                    try {
                        sh 'mvn clean package'
                    } finally {
                        // Record the duration whether or not the build succeeded.
                        def duration = System.currentTimeMillis() - stageStartTime
                        recordMetric('build_duration', duration)
                    }
                }
            }
        }
    }
    post {
        always {
            script {
                // Use the build's own start timestamp: a value stored in the
                // `environment` block would be a string and break the subtraction.
                def totalDuration = System.currentTimeMillis() - currentBuild.startTimeInMillis
                recordMetric('total_build_duration', totalDuration)
                // Send metrics to monitoring system
                sendMetricsToPrometheus()
            }
        }
    }
}

// Writes one metric to the build log in `METRIC name=value` form.
def recordMetric(String name, Object value) {
    echo "METRIC ${name}=${value}"
    // Implementation to store metric
}

// Placeholder hook for pushing the recorded metrics to Prometheus.
def sendMetricsToPrometheus() {
    // Implementation to send metrics to Prometheus
}

2. Alert Integration

// Alert integration: send a detailed e-mail from the failing stage, then
// notify Slack and open a PagerDuty incident from the `post { failure }` hook.
pipeline {
    agent any
    environment {
        SLACK_CHANNEL = '#deployments'
        // NOTE(review): placeholder credentials ID - use the ID configured in Jenkins.
        PAGERDUTY_SERVICE = credentials('pagerduty-service-key')
    }
    stages {
        stage('Critical Operation') {
            steps {
                script {
                    try {
                        performCriticalOperation()
                    } catch (Exception e) {
                        // Send the detailed report while the exception is
                        // still available, then let the build fail.
                        notifyFailure(e)
                        throw e
                    }
                }
            }
        }
    }
    post {
        failure {
            script {
                // Slack notification
                slackSend(
                    channel: SLACK_CHANNEL,
                    color: 'danger',
                    message: "Pipeline failed: ${currentBuild.fullDisplayName}"
                )
                // PagerDuty incident
                // NOTE(review): step name and parameter names depend on the
                // installed PagerDuty plugin version - confirm before use.
                pagerduty(
                    serviceKey: PAGERDUTY_SERVICE,
                    incidentKey: "${env.JOB_NAME}-${env.BUILD_NUMBER}",
                    description: "Pipeline failure in ${env.JOB_NAME}",
                    details: "Build URL: ${env.BUILD_URL}"
                )
            }
        }
    }
}

// Placeholder for the operation whose failure should trigger alerts.
def performCriticalOperation() {
    // Critical operation implementation
}

// Collects build context for the given exception and e-mails an HTML report.
def notifyFailure(Exception e) {
    def errorDetails = [
        job: env.JOB_NAME,
        build: env.BUILD_NUMBER,
        error: e.message,
        stage: env.STAGE_NAME,
        url: env.BUILD_URL
    ]
    // Send detailed error notification
    emailext(
        subject: "Pipeline Failure: ${env.JOB_NAME}",
        body: generateErrorReport(errorDetails),
        // NOTE(review): placeholder recipient - replace with the team address.
        to: 'team@example.com'
    )
}

/**
 * Builds an HTML fragment describing a pipeline failure.
 *
 * @param details map with the keys job, build, stage, error and url
 * @return the HTML report as a string
 */
def generateErrorReport(Map details) {
    // Generate detailed HTML error report
    return """
        <h2>Pipeline Failure Report</h2>
        <p>Job: ${details.job}</p>
        <p>Build: ${details.build}</p>
        <p>Stage: ${details.stage}</p>
        <p>Error: ${details.error}</p>
        <p><a href="${details.url}">Build Details</a></p>
    """
}

Best Practices

1. Error Prevention

  • Implement input validation
  • Use environment checks
  • Verify dependencies
  • Test error scenarios

2. Error Recovery

  • Implement graceful degradation
  • Use circuit breakers
  • Implement fallback mechanisms
  • Clean up resources

3. Monitoring

  • Track error rates
  • Monitor performance metrics
  • Set up alerts
  • Maintain audit logs


Exercise 1: Implement Resilient Deployment Pipeline

Create a pipeline that implements various resilience patterns:

```groovy pipeline { agent any

environment {
    APP_NAME = 'resilient-app'

stages {
    stage('Deploy') {
        steps {
            script {
                def deployed = false
                def attempts = 0
                while (!deployed && attempts < MAX_RETRIES) {
                    try {
                        // Deploy application
                        // Verify deployment
                        if (verifyDeployment()) {
                            deployed = true
                        } else {
                            throw new Exception("Deployment verification failed")
                    } catch (Exception e) {
                        if (attempts < MAX_RETRIES) {
                            echo "Deployment attempt ${attempts} failed, retrying..."
                            sleep(time: 30, unit: 'SECONDS')
                        } else {
                            error "Deployment failed after ${MAX_RETRIES} attempts"
    stage('Health Check') {
        steps {
            script {
                def healthy = false
                def checks = 0
                while (!healthy && checks < HEALTH_CHECK_RETRIES) {
                    try {
                        def status = sh(
                            script: "curl -s -w '%{http_code}' http://localhost:8080/health",
                            returnStdout: true
                        if (status == '200') {
                            healthy = true
                            echo "Application is healthy"
                        } else {
                            throw new Exception("Health check failed with status ${status}")
                    } catch (Exception e) {
                        if (checks < HEALTH_CHECK_RETRIES) {
                            echo "Health check attempt ${checks} failed, retrying..."
                            sleep(time: HEALTH_CHECK_DELAY, unit: 'SECONDS')
                        } else {
                            error "Health check failed after ${HEALTH_CHECK_RETRIES} attempts"

post {
    failure {
        script {
            // Rollback on failure
            try {
            } catch (Exception e) {
                echo "Rollback failed: ${e.message}"
            // Notify team
    success {
        script {
} }

def deploy() { // Deployment implementation sh ‘./’ }

def verifyDeployment() { // Deployment verification logic def status = sh( script: ‘./’, returnStatus: true ) return status == 0 }

def rollback() { // Rollback implementation sh ‘./’ }

def notifyFailure() { // Failure notification implementation emailext( subject: “[${APP_NAME}] Deployment Failed”, body: “Deployment of ${APP_NAME} failed. Please check the build logs.”, to: ‘’ ) }

def notifySuccess() { // Success notification implementation slackSend( channel: ‘#deployments’, color: ‘good’, message: “