progetti:htcondor-tf:htcondor-ce_accounting
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revision | |||
progetti:htcondor-tf:htcondor-ce_accounting [2019/05/10 09:42] – dalpra@infn.it | progetti:htcondor-tf:htcondor-ce_accounting [2019/07/10 11:52] (current) – dalpra@infn.it | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ===== Accounting with HTCondor-CE (Draft) ===== | ||
+ | A good thing about HTCondor-CE is that accounting is greatly simplified by the fact that job history files are self-consistent: they contain both grid and batch data, so there is no need for blah records, nor to search for matches between distinct sets of grid and batch usage records. | ||
+ | |||
+ | To enable the creation of job history files, one has to define on each schedd: | ||
+ | * '' | ||
+ | |||
+ | Then there will be an HTCondor job history file for each finished job. Each row in the file has the format: '' | ||
+ | Using Python, parsing the file into a dictionary is almost straightforward: | ||
+ | |||
+ | < | ||
+ | def jobfile2dict(fn): | ||
+ | try: | ||
+ | f = open(fn,' | ||
+ | except IOError: | ||
+ | return {} | ||
+ | return dict([map(str.strip, | ||
+ | </ | ||
+ | |||
+ | It is sufficient to read the '' | ||
+ | the job history file is moved to another folder, to prevent double counting. | ||
+ | |||
+ | See below an example script performing just that. Note that it extracts more '' | ||
+ | such as the job exit status and the hostname of the Worker Node, which can be used to look up the HS06 power of the machine and obtain a precise WallClockTime normalization. | ||
+ | |||
+ | < | ||
+ | # | ||
+ | |||
+ | import os, sys, time, json | ||
+ | import psycopg2 | ||
+ | from socket import getfqdn | ||
+ | |||
+ | myfqdn = getfqdn() | ||
+ | myhn = myfqdn.split(' | ||
+ | myconf = '/ | ||
+ | mylog = '/ | ||
+ | |||
+ | """ | ||
+ | Example configuration file | ||
+ | [root@ce02-htc ~]# cat / | ||
+ | { | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | }, | ||
+ | " | ||
+ | " | ||
+ | }, | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | } | ||
+ | } | ||
+ | |||
+ | """ | ||
+ | |||
+ | def mlog(msg, logfile = mylog): | ||
+ | """ | ||
+ | """ | ||
+ | f = open(logfile, | ||
+ | f.write(" | ||
+ | f.flush() | ||
+ | f.close() | ||
+ | |||
+ | try: | ||
+ | f = open(myconf,' | ||
+ | cnf = json.load(f) | ||
+ | except Exception, | ||
+ | mlog(" | ||
+ | sys.exit(1) | ||
+ | |||
+ | mylog = cnf[' | ||
+ | f.close() | ||
+ | |||
+ | def help(): | ||
+ | print "Usage example: python htc_gratiajobs.py" | ||
+ | print "parse HTCondor history job files from $(PER_JOB_HISTORY_DIR)" | ||
+ | print " | ||
+ | sys.exit(0) | ||
+ | |||
+ | def now(): | ||
+ | return time.ctime(time.time()) | ||
+ | |||
+ | class dbconn(): | ||
+ | def __init__(self, | ||
+ | self.conn = psycopg2.connect(database=database, | ||
+ | self.curs = self.conn.cursor() | ||
+ | |||
+ | conndict = cnf[' | ||
+ | qc = dbconn(**conndict) | ||
+ | | ||
+ | dt2sec = lambda x : x.days * 86400 + x.seconds | ||
+ | idf = lambda x : x | ||
+ | thishn = lambda x : myhn | ||
+ | |||
+ | # cfr. manual: condor-V8_8_1-Manual/ | ||
+ | # ExitStatus: | ||
+ | # The way that HTCondor previously dealt with a job's exit status. This attribute should no longer be used. It is not always accurate in heterogeneous pools, or if the job exited with a signal. Instead, see the attributes: ExitBySignal, | ||
+ | |||
+ | #APEL: takes the following keys: | ||
+ | # | ||
+ | |||
+ | d = {' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | ' | ||
+ | } | ||
+ | |||
+ | jdir = cnf[' | ||
+ | |||
+ | def jobfile2dict(fn): | ||
+ | try: | ||
+ | f = open(fn,' | ||
+ | except IOError: | ||
+ | return {} | ||
+ | return dict([map(str.strip, | ||
+ | |||
+ | def cleanval(v): | ||
+ | if v is None: | ||
+ | return v | ||
+ | if v and v[0] == v[-1] == '"' | ||
+ | return v[1:-1] | ||
+ | try: | ||
+ | return int(float(v)) | ||
+ | except ValueError, | ||
+ | return v | ||
+ | |||
+ | getwn = lambda s: s and s.split(' | ||
+ | |||
+ | #use os.scandir() iterator on python3 | ||
+ | F = os.listdir(jdir) | ||
+ | if not os.path.isdir(cnf[' | ||
+ | os.mkdir(cnf[' | ||
+ | |||
+ | if not os.path.isdir(cnf[' | ||
+ | os.mkdir(cnf[' | ||
+ | |||
+ | K = d.keys() | ||
+ | sq = """ | ||
+ | for n,fn in enumerate(F): | ||
+ | if not fn.startswith(' | ||
+ | jf = os.path.join(jdir, | ||
+ | jd = jobfile2dict(jf) | ||
+ | dtup = dict([(k, | ||
+ | df = {} | ||
+ | for k,v in dtup.items(): | ||
+ | df[k] = cleanval(v) | ||
+ | df[' | ||
+ | df[' | ||
+ | df[' | ||
+ | df[' | ||
+ | for k in [' | ||
+ | df[k] = df[k] or 0 | ||
+ | print n, | ||
+ | qc.curs.execute(sq, | ||
+ | os.rename(jf, | ||
+ | |||
+ | qc.conn.commit() | ||
+ | qc.curs.close() | ||
+ | qc.conn.close() | ||
+ | |||
+ | for fn in os.listdir(cnf[' | ||
+ | os.rename(os.path.join(cnf[' | ||
+ | |||
+ | |||
+ | |||
+ | </ |
progetti/htcondor-tf/htcondor-ce_accounting.txt · Last modified: 2019/07/10 11:52 by dalpra@infn.it