Skip to content
Snippets Groups Projects
tandemcondorprocess.cpp 15.1 KiB
Newer Older
/**
 * \file /core/tandem_run/tandemcondorprocess.cpp
 * \date 5/9/2017
 * \author Olivier Langella
 * \brief handles the execution of a batch of X!Tandem processes through HTCondor jobs
 */

/*******************************************************************************
* Copyright (c) 2017 Olivier Langella <olivier.langella@u-psud.fr>.
*
* This file is part of XTPcpp.
*
*     XTPcpp is free software: you can redistribute it and/or modify
*     it under the terms of the GNU General Public License as published by
*     the Free Software Foundation, either version 3 of the License, or
*     (at your option) any later version.
*
*     XTPcpp is distributed in the hope that it will be useful,
*     but WITHOUT ANY WARRANTY; without even the implied warranty of
*     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*     GNU General Public License for more details.
*
*     You should have received a copy of the GNU General Public License
*     along with XTPcpp.  If not, see <http://www.gnu.org/licenses/>.
*
* Contributors:
*     Olivier Langella <olivier.langella@u-psud.fr> - initial API and implementation
******************************************************************************/

Langella Olivier's avatar
Langella Olivier committed
#include "tandemcondorprocess.h"
#include <QDebug>
#include <pappsomspp/pappsoexception.h>
#include <QSettings>
#include <QProcess>
Langella Olivier's avatar
Langella Olivier committed
#include <QXmlSimpleReader>
#include <QThread>
#include "../../input/condorqxmlsaxhandler.h"
Langella Olivier's avatar
Langella Olivier committed
/**
 * \brief prepares a batch of X!Tandem runs that will be submitted to HTCondor
 *
 * Creates a temporary working directory (under the "condor/tmp_dir" setting,
 * "/tmp" by default) and reads the HTCondor command paths and the requested
 * memory from QSettings.
 *
 * \param p_main_window forwarded to the TandemBatchProcess base class
 * \param p_monitor forwarded to the TandemBatchProcess base class
 * \param tandem_run_batch forwarded to the TandemBatchProcess base class
 * \throws pappso::PappsoException if the temporary directory cannot be created
 */
TandemCondorProcess::TandemCondorProcess(MainWindow * p_main_window, WorkMonitorInterface * p_monitor, const TandemRunBatch & tandem_run_batch) : TandemBatchProcess(p_main_window,p_monitor, tandem_run_batch) {
    /*
    Example of the HTCondor submit file produced by run() :

    Universe   = vanilla
    notification   = Error
    Rank       = Mips
    request_memory= 50000
    request_cpus = 1
    Executable = /usr/bin/tandem
    Log        = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/submit_condor.log
    Output        = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/tandem.$(Process).out
    Error        = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/tandem.$(Process).error

    Arguments = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/xtandem_param2054956555872858570.xml
    Queue
    */

    QSettings settings;

    // template path of the temporary directory : QTemporaryDir makes it unique
    QString condor_tmp_dir = QString("%1/xtpcpp").arg(settings.value("condor/tmp_dir", "/tmp").toString());
    _p_tmp_dir = new QTemporaryDir(condor_tmp_dir);

    if (!_p_tmp_dir->isValid()) {
        // the destructor is not called when a constructor throws :
        // release the directory object here to avoid a leak
        delete _p_tmp_dir;
        _p_tmp_dir = nullptr;
        throw pappso::PappsoException(QObject::tr("problem creating condor temporary directory in %1\n").arg(condor_tmp_dir));
    }
    _p_tmp_dir->setAutoRemove(true);

    _condor_submit_command = settings.value("condor/submit", "/usr/bin/condor_submit").toString();
    _condor_q_command = settings.value("condor/condor_q", "/usr/bin/condor_q").toString();
    _condor_rm_command = settings.value("condor/condor_rm", "/usr/bin/condor_rm").toString();
    _condor_request_memory = settings.value("condor/request_memory", "10000").toUInt();
}

/**
 * \brief releases the temporary directory object allocated by the constructor
 */
TandemCondorProcess::~TandemCondorProcess()
{
    delete _p_tmp_dir;
}
/**
 * \brief number of jobs of the HTCondor cluster
 * \return the current value of _condor_job_size (set by parseCondorJobNumber)
 */
unsigned int TandemCondorProcess::getCondorJobSize() const
{
    return _condor_job_size;
}

void TandemCondorProcess::prepareXmlDatabaseFile() {

    QFile xml_database_file(QString("%1/database.xml").arg(_p_tmp_dir->path()));

    if (xml_database_file.open(QIODevice::WriteOnly))
    {
        _xml_database_file = QFileInfo( xml_database_file.fileName()).absoluteFilePath();
        QXmlStreamWriter * p_out = new QXmlStreamWriter();
        p_out->setDevice(&xml_database_file);
        writeXmlDatabaseFile(p_out);
        xml_database_file.close();
        delete p_out;
    } else
    {
        throw pappso::PappsoException(QObject::tr("error : cannot open the XML database file : %1\n").arg(xml_database_file.fileName()));
    }

Langella Olivier's avatar
Langella Olivier committed
void TandemCondorProcess::run() {
Olivier Langella's avatar
Olivier Langella committed
    qDebug() << "TandemCondorProcess::run begin ";
    QFileInfo preset_info(_tandem_run_batch._preset_file);
    _preset_file = QString("%1/%2").arg(_p_tmp_dir->path()).arg(preset_info.fileName());
    QFile::copy(_tandem_run_batch._preset_file, _preset_file);
    _preset_file = _tandem_run_batch._preset_file;

    prepareXmlDatabaseFile();


    //condor submit file :
    QFile submit_file(QString("%1/submit.txt").arg(_p_tmp_dir->path()));
    QTextStream * p_out = nullptr;

    if (submit_file.open(QIODevice::WriteOnly))
    {
        p_out = new QTextStream();
        p_out->setDevice(&submit_file);

        *p_out <<     "Universe   = vanilla" << endl;
        *p_out <<     "notification   = Error" << endl;
        *p_out <<     "Rank       = Mips" << endl;
Langella Olivier's avatar
Langella Olivier committed
        *p_out <<     "request_memory= " << _condor_request_memory << endl;
        *p_out <<     "request_cpus = 1" << endl;
        *p_out <<     "Executable = " << _tandem_run_batch._tandem_bin_path << endl;
Olivier Langella's avatar
Olivier Langella committed
        *p_out <<     "Log        = " << _p_tmp_dir->path() << "/condor.log" << endl;
        *p_out <<     "Output        = " << _p_tmp_dir->path() << "/tandem.$(Process).out" << endl;
        *p_out <<     "Error        = " << _p_tmp_dir->path() << "/tandem.$(Process).error" << endl;
        /*
        Log        = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/submit_condor.log
        Output        = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/tandem.$(Process).out
        Error        = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/tandem.$(Process).error

        Arguments = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/xtandem_param2054956555872858570.xml
        Queue
        */
    } else
    {
        throw pappso::PappsoException(QObject::tr("error : cannot open condor submit file : %1\n").arg(submit_file.fileName()));
    }



    std::vector<QTemporaryFile *> input_file_list;

    int i=0;
    _p_monitor->setProgressMaximumValue(_tandem_run_batch._mz_file_list.size());
    for (QString mz_file : _tandem_run_batch._mz_file_list) {


Olivier Langella's avatar
Olivier Langella committed
        QTemporaryFile *  p_xml_input_file = new QTemporaryFile(QString("%1/tandem").arg(_p_tmp_dir->path()));

        input_file_list.push_back(p_xml_input_file);
        p_xml_input_file->setAutoRemove(false);
        if (p_xml_input_file->open())
        {
            QXmlStreamWriter * p_xml_out = new QXmlStreamWriter();
            p_xml_out->setDevice(p_xml_input_file);
            writeXmlInputFile(p_xml_out, mz_file);

            p_xml_input_file->close();
            delete p_xml_out;


Olivier Langella's avatar
Olivier Langella committed
            *p_out <<     "Arguments        = " << QFileInfo( p_xml_input_file->fileName()).absoluteFilePath() << endl;
            *p_out <<     "Queue" << endl;
        } else
        {
            throw pappso::PappsoException(QObject::tr("error : cannot open the XML X!Tandem input file : %1\n").arg(p_xml_input_file->fileName()));
        }
        i++;
    }
    for (QTemporaryFile *  p_xml_input_file: input_file_list) {
        delete p_xml_input_file;
    }


    if (p_out != nullptr) {
        submit_file.close();
        delete p_out;
    }

    //now run condor job on submit_file

    QStringList arguments;

    arguments << QFileInfo( submit_file.fileName()).absoluteFilePath();

    QProcess * condor_process = new QProcess();
    //hk_process->setWorkingDirectory(QFileInfo(_hardklor_exe).absolutePath());
Olivier Langella's avatar
Olivier Langella committed
    qDebug() << "TandemCondorProcess::run command " << _condor_submit_command << " " << arguments.join(" ");
    condor_process->start(_condor_submit_command, arguments);


    if (!condor_process->waitForStarted()) {
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed to start"));
    }

Olivier Langella's avatar
Olivier Langella committed
    if (!condor_process->waitForFinished(_max_xt_time_ms)) {
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed to finish"));
    }

    QString perr = condor_process->readAllStandardError();
    if (perr.length()) {

        qDebug() << "TandemCondorProcess::run readAllStandardError " << perr;
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed :\n%1").arg(perr));
    }
    else {
        qDebug() << "TandemCondorProcess::run readAllStandardError OK " << perr;
    }

    QString pjob = condor_process->readAllStandardOutput();
    if (pjob.length()) {
        qDebug() << "TandemCondorProcess::run readAllStandardOutput OK " << pjob;
    }
    else {
        qDebug() << "TandemCondorProcess::run readAllStandardOutput " << pjob;
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed :\n%1").arg(pjob));
Olivier Langella's avatar
Olivier Langella committed

    //Submitting job(s).\n1 job(s) submitted to cluster 29.\n
    parseCondorJobNumber(pjob);
Olivier Langella's avatar
Olivier Langella committed
    _p_monitor->setProgressMaximumValue(_condor_job_size);
    qDebug() << "TandemCondorProcess::run job=" << _condor_cluster_number << " size=" << _condor_job_size;


    /*
    if (!xt_process->waitForFinished(_max_xt_time_ms)) {
        throw pappso::PappsoException(QObject::tr("can't wait for X!Tandem process to finish : timeout at %1").arg(_max_xt_time_ms));
    }
    */
    QByteArray result = condor_process->readAll();

    QProcess::ExitStatus Status = condor_process->exitStatus();

Olivier Langella's avatar
Olivier Langella committed
    qDebug() << "TandemCondorProcess::run ExitStatus " << Status << result.data();
    if (Status != 0)
    {
        // != QProcess::NormalExit
        throw pappso::PappsoException(QObject::tr("error executing HTCondor Status != 0 : %1 %2\n%3").arg(_tandem_run_batch._tandem_bin_path).arg(arguments.join(" ").arg(result.data())));
    }

    delete condor_process;
Olivier Langella's avatar
Olivier Langella committed


    surveyCondorJob();

    qDebug() << "TandemCondorProcess::run end" ;
Olivier Langella's avatar
Olivier Langella committed

void TandemCondorProcess::surveyCondorJob() {

Langella Olivier's avatar
Langella Olivier committed
    //condor is running job : we have to survey condor job using "condor_q -xml _condor_cluster_number"
    bool keep_running = true;
    while(keep_running) {
Langella Olivier's avatar
Langella Olivier committed

        QThread::msleep(_condor_status_timer_millisecond);
        getCondorJobState();
        _p_monitor->message(QObject::tr("X!Tandem is running in condor cluster %1\n%2 on %3 jobs completed").arg(_condor_cluster_number).arg(_condor_completed_jobs).arg(_condor_job_size), _condor_completed_jobs);
        
        if (_condor_completed_jobs == _condor_job_size) keep_running = false;
        
Langella Olivier's avatar
Langella Olivier committed
        if (shouldIstop()) {
            keep_running = false;
Langella Olivier's avatar
Langella Olivier committed
            //condor_rm
            condorRemoveJob();
            throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem jobs stopped by the user"));
        }
    }
}
void TandemCondorProcess::condorRemoveJob() {

    QStringList arguments;

    arguments << QString("%1").arg(_condor_cluster_number);

    QProcess condor_q_process;
    //hk_process->setWorkingDirectory(QFileInfo(_hardklor_exe).absolutePath());
    qDebug() << "TandemCondorProcess::condorRemoveJob command " << _condor_rm_command << " " << arguments.join(" ");
    condor_q_process.start(_condor_rm_command, arguments);


    if (!condor_q_process.waitForStarted()) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_rm process failed to start"));
    }

    if (!condor_q_process.waitForFinished(_max_xt_time_ms)) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_rm process failed to finish"));
    }

    QString perr = condor_q_process.readAllStandardError();
    if (perr.length()) {

        qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardError " << perr;
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed :\n%1").arg(perr));
    }
    else {
        qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardError OK " << perr;
    }

    QString pjob = condor_q_process.readAllStandardOutput();
    if (pjob.length()) {
        qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardOutput OK " << pjob;
Langella Olivier's avatar
Langella Olivier committed
    else {
        qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardOutput " << pjob;
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed :\n%1").arg(pjob));
    }

Langella Olivier's avatar
Langella Olivier committed
}
void TandemCondorProcess::getCondorJobState() {
Langella Olivier's avatar
Langella Olivier committed
    QStringList arguments;
Langella Olivier's avatar
Langella Olivier committed
    arguments << "-xml" << QString("%1").arg(_condor_cluster_number);
Langella Olivier's avatar
Langella Olivier committed
    QProcess condor_q_process;
    //hk_process->setWorkingDirectory(QFileInfo(_hardklor_exe).absolutePath());
    qDebug() << "TandemCondorProcess::getCondorJobState command " << _condor_q_command << " " << arguments.join(" ");
    condor_q_process.start(_condor_q_command, arguments);
Langella Olivier's avatar
Langella Olivier committed
    if (!condor_q_process.waitForStarted()) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed to start"));
Langella Olivier's avatar
Langella Olivier committed
    if (!condor_q_process.waitForFinished(_max_xt_time_ms)) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed to finish"));
Langella Olivier's avatar
Langella Olivier committed
    QString perr = condor_q_process.readAllStandardError();
    if (perr.length()) {
Langella Olivier's avatar
Langella Olivier committed
        qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardError " << perr;
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed :\n%1").arg(perr));
Langella Olivier's avatar
Langella Olivier committed
    else {
        qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardError OK " << perr;
Langella Olivier's avatar
Langella Olivier committed
    QString pjob = condor_q_process.readAllStandardOutput();
    if (pjob.length()) {
        qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardOutput OK " << pjob;
    }
    else {
        qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardOutput " << pjob;
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed :\n%1").arg(pjob));
Langella Olivier's avatar
Langella Olivier committed
    //Submitting job(s).\n1 job(s) submitted to cluster 29.\n
    parseCondorQueue(pjob);
}

/**
 * \brief parses the XML output of "condor_q -xml" with a SAX handler
 *
 * The CondorQxmlSaxHandler is built with a pointer to this object; presumably
 * it reports the job counts back through setCondorJobStatus (to be confirmed
 * in the handler implementation).
 *
 * \param condor_q_xml XML string written by condor_q on its standard output
 * \throws pappso::PappsoException if the XML string cannot be parsed
 */
void TandemCondorProcess::parseCondorQueue(QString & condor_q_xml) {

    // stack allocated handler : released even when the exception below is thrown
    CondorQxmlSaxHandler parser(this);

    QXmlSimpleReader simplereader;
    simplereader.setContentHandler(&parser);
    simplereader.setErrorHandler(&parser);

    qDebug() << "TandemCondorProcess::parseCondorQueue Read condor_q_xml";

    QXmlInputSource xml_input_source;
    xml_input_source.setData(condor_q_xml);

    if (!simplereader.parse(xml_input_source)) {
        qDebug() << parser.errorString();

        throw pappso::PappsoException(QObject::tr("Error reading condor_q xml string :\n %1\n%2").arg(parser.errorString()).arg(condor_q_xml));
    }
}


/**
 * \brief extracts the job count and the cluster number from the condor_submit output
 *
 * Expected output :
 *   Submitting job(s)...
 *   3 job(s) submitted to cluster 3.
 *
 * On success _condor_job_size and _condor_cluster_number are updated.
 *
 * \param condor_job text written by condor_submit on its standard output
 * \throws pappso::PappsoException if the job numbers are not found
 */
void TandemCondorProcess::parseCondorJobNumber(QString condor_job) {
    // both numbers need at least one digit ("[0-9]+") : an empty capture would
    // silently be converted to 0.
    // No trailing wildcard : an unescaped "." after the greedy number could
    // steal the last digit of the cluster number when nothing follows it.
    QRegExp txt_submit("([0-9]+) job\\(s\\) submitted to cluster ([0-9]+)");

    if (txt_submit.indexIn(condor_job, 0) != -1) {
        _condor_cluster_number = txt_submit.cap(2).toUInt();
        _condor_job_size = txt_submit.cap(1).toUInt();
    }
    else {
        throw pappso::PappsoException(QObject::tr("unable to find HTCondor job numbers in %1").arg(condor_job));
    }
}


void TandemCondorProcess::setCondorJobStatus(std::int8_t count_status[10]) {

    QString status_message = QString("%1 unexpanded jobs\n%2 idle jobs\n%3 running jobs\n%4 removed jobs\n%5 completed jobs\n%6 held jobs\n%7 submission errors").arg(count_status[0]).arg(count_status[1]).arg(count_status[2]).arg(count_status[3]).arg(count_status[4]).arg(count_status[5]).arg(count_status[6]);
    
    _p_monitor->setText(status_message);
    
    _condor_completed_jobs = count_status[(std::int8_t) CondorJobStatus::Completed];