View Javadoc

1   package fr.in2p3.jsaga.adaptor.job;
2   
3   import org.apache.log4j.Logger;
4   import org.globus.gram.*;
5   import org.ietf.jgss.GSSCredential;
6   import org.ietf.jgss.GSSException;
7   import org.ogf.saga.error.NoSuccessException;
8   
9   import java.util.Timer;
10  import java.util.TimerTask;
11  
12  /* ***************************************************
13   * *** Centre de Calcul de l'IN2P3 - Lyon (France) ***
14   * ***             http://cc.in2p3.fr/             ***
15   * ***************************************************
16   * File:   LCGCEJobMonitorWatchdog
17   * Author: Sylvain Reynaud (sreynaud@in2p3.fr)
18   * Date:   24 juin 2010
19   * ***************************************************
20   * Description:                                      */
21  /**
22   * todo: notify several JobService instances connected to the same LCG-CE with a single grid-monitor ?
23   * -> if several JobService instances in the same JVM ?     (yes: submit job with 'ps | grep dest-url')
24   * -> if several JobService instances in separate JVMs ?    (no)
25   * -> if several JobService instances on separate hosts ?   (no)
26   */
27  public class LCGCEJobMonitorWatchdog extends TimerTask {
28      private static final String RSL = "&(executable = /opt/globus/libexec/grid_monitor_lite.sh)" +
29              "(arguments = '--dest-url=https://134.158.71.194:9000/dev/stdout')";
30      private static final int WATCHDOG_PERIOD = 5*60*1000;
31  
32      private static Logger s_logger = Logger.getLogger(LCGCEJobMonitorListener.class);
33  
34      private GramJob m_gridMonitorJob;
35      private String m_serverUrl;
36      private Timer m_timer;
37  
38      public LCGCEJobMonitorWatchdog(GSSCredential cred, String host, int port) throws NoSuccessException {
39          m_gridMonitorJob = new GramJob(cred, RSL);
40          m_serverUrl = host+":"+port+"/jobmanager-fork";
41  
42          // start grid monitor
43          this.startMonitor();
44  
45          // start watchdog
46          m_timer = new Timer();
47          m_timer.schedule(this, WATCHDOG_PERIOD, WATCHDOG_PERIOD);
48      }
49  
50      public void stopAll() throws NoSuccessException {
51          // stop watchdog
52          m_timer.cancel();
53  
54          // stop grid monitor
55          this.stopMonitor();
56      }
57  
58      public void run() {
59          try {
60              if (this.isStopped()) {
61                  this.startMonitor();
62              } else {
63                  s_logger.info("Grid monitor is alived: "+m_gridMonitorJob.getIDAsString());
64              }
65          } catch (NoSuccessException e) {
66              s_logger.warn("Failed to start grid monitor", e);
67          }
68      }
69  
70      private void startMonitor() throws NoSuccessException {
71          try {
72              Gram.request(m_serverUrl, m_gridMonitorJob, false);
73              s_logger.info("Started grid monitor: "+m_gridMonitorJob.getIDAsString());
74          } catch (GramException e) {
75              throw new NoSuccessException(e);
76          } catch (GSSException e) {
77              throw new NoSuccessException(e);
78          }
79      }
80  
81      private void stopMonitor() throws NoSuccessException {
82          try {
83              Gram.cancel(m_gridMonitorJob);
84              s_logger.info("Stopped grid monitor: "+m_gridMonitorJob.getIDAsString());
85          } catch (GramException e) {
86              throw new NoSuccessException(e);
87          } catch (GSSException e) {
88              throw new NoSuccessException(e);
89          }
90      }
91  
92      private boolean isStopped() throws NoSuccessException {
93          try {
94              Gram.jobStatus(m_gridMonitorJob);
95          } catch (GramException e) {
96              if (e.getErrorCode() == GramException.ERROR_CONTACTING_JOB_MANAGER) {
97                  return true;    // job manager is stopped when status is DONE
98              } else {
99                  throw new NoSuccessException(e);
100             }
101         } catch (GSSException e) {
102             throw new NoSuccessException(e);
103         }
104         switch (m_gridMonitorJob.getStatus()) {
105             case GramJob.STATUS_DONE:
106             case GramJob.STATUS_FAILED:
107                 return true;
108             default:
109                 return false;
110         }
111     }
112 }