Solaris Volume Manager Administration Guide
Previous Next

Monitoring Solaris Volume Manager With a cron Job

How to Automate Checking for Errors in Volumes

  • To automatically check your Solaris Volume Manager configuration for errors, create a script that the cron utility can periodically run.

    The following example shows a script that you can adapt and modify for your needs.


    Note - This script serves as a starting point for automating error checking for Solaris Volume Manager. You probably need to modify this script for your own configuration.


    #!/bin/ksh
    #
    #ident "@(#)metacheck.sh   1.3     96/06/21 SMI"
    # ident='%Z%%M%   %I%     %E% SMI'
    #
    # Copyright (c) 1999 by Sun Microsystems, Inc.
    #
    # metacheck
    #
    # Check on the status of the metadevice configuration.  If there is a problem
    # return a non zero exit code.  Depending on options, send email notification.
    #
    # -h
    #    help
    # -s setname
    #    Specify the set to check.  By default, the 'local' set will be checked.
    # -m recipient [recipient...]
    #    Send email notification to the specified recipients.  This
    #    must be the last argument. The notification shows up as a short 
    #    email message with a subject of 
    #        "Solaris Volume Manager Problem: metacheck.who.nodename.setname"
    #    which summarizes the problem(s) and tells how to obtain detailed 
    #    information. The "setname" is from the -s option, "who" is from 
    #    the -w option, and "nodename" is reported by uname(1).
    #    Email notification is further affected by the following options:
    #        -f    to suppress additional messages after a problem 
    #            has been found. 
    #        -d    to control the suppression.
    #        -w    to identify who generated the email.
    #        -t    to force email even when there is no problem.
    # -w who
    #    indicate who is running the command. By default, this is the
    #    user-name as reported by id(1M). This is used when sending
    #    email notification (-m).
    # -f 
    #    Enable filtering.  Filtering applies to email notification (-m).
    #    Filtering requires root permission.  When sending email notification
    #    the file /etc/lvm/metacheck.setname.pending is used to 
    #    control the filter.  The following matrix specifies the behavior
    #    of the filter:
    #
    #    problem_found    file_exists
    #      yes          no        Create file, send notification
    #      yes          yes        Resend notification if the current date 
    #                    (as specified by -d datefmt) is 
    #                    different than the file date.
    #      no          yes        Delete file, send notification 
    #                    that the problem is resolved.
    #      no          no        Send notification if -t specified.
    #    
    # -d datefmt
    #    Specify the format of the date for filtering (-f).  This option 
    #    controls how often re-notification via email occurs. If the
    #    current date according to the specified format (strftime(3C)) is 
    #    identical to the date contained in the 
    #    /etc/lvm/metacheck.setname.pending file then the message is 
    #    suppressed. The default date format is "%D", which will send one 
    #    re-notification per day.
    # -t
    #    Test mode.  Enable email generation even when there is no problem.
    #    Used for end-to-end verification of the mechanism and email addresses.
    #    
    #
    # These options are designed to allow integration of metacheck
    # into crontab.  For example, a root crontab entry of:
    #
    # 0,15,30,45 * * * * /usr/sbin/metacheck -f -w SVMcron \
    #   -d '\%D \%H' -m notice@example.com 2148357243.8333033@pager.example.com
    #
    # would check for problems every 15 minutes, and generate an email to
    # notice@example.com (and send to an email pager service) every hour when 
    # there is a problem.  Note the \ prior to the '%' characters for a 
    # crontab entry.  Bounced email would come back to root@nodename.
    # The subject line for email generated by the above line would be
    # Solaris Volume Manager Problem: metacheck.SVMcron.nodename.local
    #
    
    # Debug trace helper.
    #
    # When the global $debug is "yes", print all arguments as a single
    # "DEBUG: ..." line straight to the controlling terminal (/dev/tty), so
    # the trace stays visible even when stdout is a pipe or is being captured.
    # For any other value of $debug the function does nothing.
    decho()
    {
        case "$debug" in
        yes)
            echo "DEBUG: $*"    < /dev/null > /dev/tty 2>&1
            ;;
        esac
    }
    
    # Word-list membership test.
    #
    # $1      the word to look for
    # $2...   the list of words to search
    #
    # Prints $1 when it equals one of the remaining arguments, otherwise
    # prints an empty line.
    strstr()
    {
        typeset    needle="$1"
        typeset    found=""
        typeset    candidate

        shift
    #   decho "strstr LOOK .$needle. FIRST .$1."
        for candidate in "$@" ; do
            if [ "$needle" = "$candidate" ] ; then
                found="$needle"
            fi
        done
        echo "$found"
    }
    
    # Word-list removal.
    #
    # $1      the word to remove
    # $2...   the list of words to filter
    #
    # Prints the remaining arguments with every occurrence of $1 dropped.
    # Each surviving word is preceded by a single space, so a non-empty
    # result starts with a blank.
    strdstr()
    {
        typeset    unwanted="$1"
        typeset    kept=""
        typeset    word

        shift
    #   decho "strdstr LOOK .$unwanted. FIRST .$1."
        for word in "$@" ; do
            if [ "$unwanted" = "$word" ] ; then
                continue
            fi
            kept="$kept $word"
        done
        echo "$kept"
    }
    
    # Join backslash-continued lines read from stdin into single lines on
    # stdout.
    #
    # A line whose last whitespace-separated field is a lone "\" is treated as
    # "to be continued": the backslash is dropped (the field separator before
    # it is kept, so the pieces stay blank-separated) and the text is
    # prepended to the next line.  All other lines are passed through
    # unchanged.
    #
    # The awk program is passed as the first operand rather than through "-e",
    # which is not part of the POSIX awk option set.  The END rule flushes a
    # trailing continued line that has no follow-up line, so no input is
    # silently dropped.
    merge_continued_lines()
    {
        awk '
        $NF == "\\" {
            $NF = ""
            line = line $0
            next
        }
        {
            print line $0
            line = ""
        }
        END {
            if (line != "")
                print line
        }'
    }
    
    # Filter an argument list down to metadevice names.
    #
    # $1...   arbitrary words (typically one line of "metastat -p" output)
    #
    # Prints every argument that consists of the letter "d" followed by one
    # or more decimal digits (d0, d10, ...), each preceded by a single space.
    # Everything else (options such as -m, slice names, counts) is discarded.
    #
    # The match is written with plain POSIX case patterns instead of the
    # ksh-only "d+([0-9])" form, so the function also parses and runs in
    # shells without ksh extended patterns.
    find_meta_devices()
    {
        typeset    devices=""

    #   decho "find_meta_devices .$*."
        while [ $# -ne 0 ] ; do
            case $1 in
            d | d*[!0-9]* )    # bare "d", or a non-digit somewhere after it
                ;;
            d* )               # "d" followed only by digits: metadevice name
                devices="$devices $1"
                ;;
            esac
            shift
        done
        echo "$devices"
    }
    
    # Print the list of top-level metadevices, i.e. metadevices that are not
    # used as a component of another metadevice.
    #
    # Reads "metastat$setarg -p" (global $setarg selects the disk set), with
    # continued lines joined by merge_continued_lines.  On each line the first
    # metadevice name is the device being defined and any further metadevice
    # names are its components.  A device goes onto the top-level list unless
    # it has already been seen as a component; every component is remembered
    # and removed from the top-level list if it was placed there earlier.
    #
    # The loop and the final echo share one brace group on the receiving end
    # of the pipeline, so the accumulated list is still visible to the echo
    # even in shells that run every pipeline stage in a subshell.
    toplevel()
    {
        typeset    comp_meta_devices=""
        typeset    top_meta_devices=""
        typeset    devices=""
        typeset    device=""
        typeset    comp=""
        typeset    line=""

        metastat$setarg -p | merge_continued_lines | {
            while read -r line ; do
                # $line is deliberately unquoted: it is split into words
                devices=`find_meta_devices $line`
                set -- $devices
                if [ $# -ne 0 ] ; then
                    device=$1
                    shift
                    # check to see if device was already referred to as a
                    # component of an earlier metadevice
                    comp=`strstr $device $comp_meta_devices`
                    if [ -z "$comp" ] ; then
                        top_meta_devices="$top_meta_devices $device"
                    fi
                    # add components to component list, remove from top list
                    while [ $# -ne 0 ] ; do
                        comp=$1
                        comp_meta_devices="$comp_meta_devices $comp"
                        top_meta_devices=`strdstr $comp $top_meta_devices`
                        shift
                    done
                fi
            done > /dev/null 2>&1
            # unquoted on purpose: collapses the leading/duplicate blanks
            echo $top_meta_devices
        }
    }
    
    #
    # - MAIN
    #
    # Main script body: parse options, snapshot the metadb / metastat / metahs
    # state, collect one message per problem class, then print or mail the
    # report.
    #
    # Internal result bits accumulated in $retval:
    #     1    metadb could not be run
    #    64    a state database replica shows an error flag
    #   128    a metadevice is in a state other than Okay/Resyncing
    #   256    a hot spare is not "Available"
    # A process exit status is only 8 bits wide, so the value is reduced
    # modulo 256 on exit; a result that would wrap to 0 (hot spare problem
    # only) is reported as 32 so that a problem never looks like success.
    METAPATH=/usr/sbin
    PATH=/usr/bin:$METAPATH
    USAGE="usage: metacheck [-s setname] [-h] [[-t] [-f [-d datefmt]] \
        [-w who] -m recipient [recipient...]]"

    datefmt="%D"
    debug="no"
    filter="no"
    mflag="no"
    set="local"
    setarg=""
    testarg="no"
    who=`id | sed -e 's/^uid=[0-9][0-9]*(//' -e 's/).*//'`

    while getopts d:Dfhms:tw: flag
    do
        case $flag in
        d)    datefmt=$OPTARG
            ;;
        D)    debug="yes"
            ;;
        f)    filter="yes"
            ;;
        h)    # documented help option
            echo $USAGE
            exit 0
            ;;
        m)    mflag="yes"
            ;;
        s)    set=$OPTARG
            if [ "$set" != "local" ] ; then
                # leading blank: $setarg is appended directly to command names
                setarg=" -s $set"
            fi
            ;;
        t)    testarg="yes"
            ;;
        w)    who=$OPTARG
            ;;
        \?)    echo $USAGE
            exit 1
            ;;
        esac
    done

    # if mflag specified then everything else is part of the recipient list;
    # without -m no operands are allowed at all
    shift `expr $OPTIND - 1`
    if [ "$mflag" = "no" ] ; then
        if [ $# -ne 0 ] ; then
            echo $USAGE
            exit 1
        fi
    else
        if [ $# -eq 0 ] ; then
            echo $USAGE
            exit 1
        fi
    fi
    recipients="$*"

    # The format is quoted so that a format containing blanks (for example
    # '%D %H') reaches date(1) as a single argument.
    curdate_filter=`date "+$datefmt"`
    curdate=`date`
    node=`uname -n`

    # establish files
    # All scratch files live in a private, mode-0700 directory created with
    # mkdir (which fails if the name already exists), instead of predictable
    # world-visible names directly under /tmp.
    tmpdir=/tmp/metacheck.$$
    msg_f=$tmpdir/msg
    msgs_f=$tmpdir/msgs
    metastat_f=$tmpdir/metastat
    metadb_f=$tmpdir/metadb
    metahs_f=$tmpdir/metahs
    pending_f=/etc/lvm/metacheck.$set.pending

    trap 'rm -rf "$tmpdir" > /dev/null 2>&1; exit 1' 1 2 3 15
    rm -rf "$tmpdir"                        > /dev/null 2>&1
    ( umask 077 && mkdir "$tmpdir" )        > /dev/null 2>&1
    if [ $? -ne 0 ] ; then
        echo "metacheck: cannot create private work directory $tmpdir" >&2
        exit 1
    fi

    # Check to see if metadb is capable of running
    have_metadb="yes"
    metadb$setarg                           > "$metadb_f" 2>&1
    if [ $? -ne 0 ] ; then
        have_metadb="no"
    fi
    grep "there are no existing databases"  < "$metadb_f" > /dev/null 2>&1
    if [ $? -eq 0 ] ; then
        have_metadb="no"
    fi
    grep "/dev/md/admin"                    < "$metadb_f" > /dev/null 2>&1
    if [ $? -eq 0 ] ; then
        have_metadb="no"
    fi

    # check for problems accessing metadbs
    retval=0
    if [ "$have_metadb" = "no" ] ; then
        retval=1
        echo "metacheck: metadb problem, can't run '$METAPATH/metadb$setarg'" \
                                            >> "$msgs_f"
    else
        # snapshot the state
        metadb$setarg 2>&1 | sed -e '1d' | merge_continued_lines > "$metadb_f"
        metastat$setarg 2>&1 | merge_continued_lines    > "$metastat_f"
        metahs$setarg -i 2>&1 | merge_continued_lines   > "$metahs_f"

        #
        # Check replicas for problems, capital letters in the flags
        # indicate an error, fields are separated by tabs.
        #
        # NOTE(review): the shell strips the backslash, so awk receives
        # "-Ft"; traditional awk treats that as "tab" -- confirm this holds
        # for the awk on your platform.
        #
        problem=`awk -F\t '{if ($1 ~ /[A-Z]/) print $1;}' "$metadb_f"`
        if [ -n "$problem" ] ; then
            retval=`expr $retval + 64`
            echo "metacheck: metadb problem, for more detail run:\n\t$METAPATH/metadb$setarg -i" \
                                            >> "$msgs_f"
        fi

        #
        # Check the metadevice state
        #
        problem=`awk \
            '/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}' \
            "$metastat_f"`
        if [ -n "$problem" ] ; then
            retval=`expr $retval + 128`
            echo "metacheck: metadevice problem, for more detail run:" \
                                            >> "$msgs_f"

            # refine the message to toplevel metadevices that have a problem
            top=`toplevel`
            set -- $top
            while [ $# -ne 0 ] ; do
                device=$1
                # query the same disk set that the snapshot above came from
                problem=`metastat$setarg $device | awk \
                    '/State:/ {if ($2 != "Okay" && $2 != "Resyncing") print $0;}'`
                if [ -n "$problem" ] ; then
                    echo "\t$METAPATH/metastat$setarg $device" >> "$msgs_f"
                    # find out what is mounted on the device
                    mp=`mount|awk '/\/dev\/md\/dsk\/'$device'[ \t]/{print $1;}'`
                    if [ -n "$mp" ] ; then
                        echo "\t\t$mp mounted on $device" >> "$msgs_f"
                    fi
                fi
                shift
            done
        fi

        #
        # Check the hotspares to see if any have been used.
        #
        problem=""
        grep "no hotspare pools found"      < "$metahs_f" > /dev/null 2>&1
        if [ $? -ne 0 ] ; then
            problem=`awk \
                '/blocks/ { if ( $2 != "Available" ) print $0;}' "$metahs_f"`
        fi
        if [ -n "$problem" ] ; then
            retval=`expr $retval + 256`
            echo "metacheck: hot spare in use, for more detail run:\n\t$METAPATH/metahs$setarg -i" \
                                            >> "$msgs_f"
        fi
    fi

    # If any errors occurred, then mail the report
    if [ $retval -ne 0 ] ; then
        if [ -n "$recipients" ] ; then
            re=""
            send="yes"
            # The pending file only matters when filtering (-f) was asked
            # for; a stale file from an earlier filtered run must not
            # suppress mail of an unfiltered run.
            if [ "$filter" = "yes" ] && [ -f "$pending_f" ] ; then
                re="Re: "
                # we have a pending notification, check date to see if we
                # resend
                penddate_filter=`head -1 "$pending_f"`
                if [ "$curdate_filter" != "$penddate_filter" ] ; then
                    rm -f "$pending_f"      > /dev/null 2>&1
                else
                    send="no"
                    if [ "$debug" = "yes" ] ; then
                        echo "metacheck: email problem notification still pending"
                        cat "$pending_f"
                    fi
                fi
            fi
            if [ "$send" = "yes" ] ; then
                if [ "$filter" = "yes" ] ; then
                    echo "$curdate_filter\n\tDate:$curdate\n\tTo:$recipients" \
                                            > "$pending_f"
                fi
                echo "Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate" >> "$msg_f"
                echo "--------------------------------------------------------------" >> "$msg_f"
                # subject follows the documented who.nodename.setname order,
                # matching the "resolved" and test mails below
                cat "$msg_f" "$msgs_f" | mailx -s \
                    "${re}Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
            fi
        else
            cat "$msgs_f"
        fi
    else
        # no problems detected,
        if [ -n "$recipients" ] ; then
            # default is to not send any mail, or print anything.
            echo "Solaris Volume Manager: $node: metacheck$setarg: Report: $curdate" >> "$msg_f"
            echo "--------------------------------------------------------------" >> "$msg_f"
            if [ -f "$pending_f" ] && [ "$filter" = "yes" ] ; then
                # pending filter exists, remove it and send OK
                rm -f "$pending_f"          > /dev/null 2>&1
                echo "Problem resolved"     >> "$msg_f"
                cat "$msg_f" | mailx -s \
                    "Re: Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
            elif [ "$testarg" = "yes" ] ; then
                # for testing, send mail every time even though there is no
                # problem
                echo "Messaging test, no problems detected" >> "$msg_f"
                cat "$msg_f" | mailx -s \
                    "Solaris Volume Manager Problem: metacheck.$who.$node.$set" $recipients
            fi
        else
            echo "metacheck: Okay"
        fi
    fi

    rm -rf "$tmpdir"                        > /dev/null 2>&1

    # Reduce the result to the 8 bits an exit status can carry, without ever
    # turning a detected problem into status 0.
    exit_status=`expr $retval % 256`
    if [ $retval -ne 0 ] && [ "$exit_status" -eq 0 ] ; then
        exit_status=32
    fi
    exit $exit_status

    For information on invoking scripts by using the cron utility, see the cron(1M) man page.

Previous Next