/**
 * \file /core/tandem_run/tandemcondorprocess.cpp
 * \date 5/9/2017
 * \author Olivier Langella
 * \brief handles execution of a bunch of X!Tandem process through condor job
 */

/*******************************************************************************
 * Copyright (c) 2017 Olivier Langella <olivier.langella@u-psud.fr>.
 *
 * This file is part of XTPcpp.
 *
 *     XTPcpp is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     XTPcpp is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 *
 *     You should have received a copy of the GNU General Public License
 *     along with XTPcpp.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Contributors:
 *     Olivier Langella <olivier.langella@u-psud.fr> - initial API and implementation
 ******************************************************************************/

#include "tandemcondorprocess.h"
#include <QDebug>
#include <pappsomspp/pappsoexception.h>
#include <QSettings>
#include <QProcess>
#include <QXmlSimpleReader>
#include <QThread>
#include "../../input/condorqxmlsaxhandler.h"


/**
 * \brief Builds a condor-backed X!Tandem batch process.
 *
 * Reads the "condor/..." entries of QSettings (temporary directory, the
 * condor_submit / condor_q / condor_rm commands and the requested memory) and
 * creates a dedicated, auto-removed temporary directory used for every file
 * generated for the job.
 *
 * \throws pappso::PappsoException if the temporary directory cannot be created.
 */
TandemCondorProcess::TandemCondorProcess(MainWindow * p_main_window, WorkMonitorInterface * p_monitor, const TandemRunBatch & tandem_run_batch)
    : TandemBatchProcess(p_main_window, p_monitor, tandem_run_batch)
{
    /* Example of the condor submit file produced by run():

    Universe   = vanilla
    notification   = Error
    Rank       = Mips
    request_memory= 50000
    request_cpus = 1
    Executable = /usr/bin/tandem
    Log        = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/submit_condor.log
    Output     = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/tandem.$(Process).out
    Error      = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/tandem.$(Process).error

    Arguments = /gorgone/pappso/tmp/temp_condor_job93294001891239208719639434471283743/xtandem_param2054956555872858570.xml
    Queue
    */
    QSettings settings;
    const QString condor_tmp_dir = QString("%1/xtpcpp").arg(settings.value("condor/tmp_dir", "/tmp").toString());

    _p_tmp_dir = new QTemporaryDir(condor_tmp_dir);
    if (!_p_tmp_dir->isValid()) {
        // The destructor is not run when a constructor throws, so the
        // directory object has to be released here to avoid leaking it.
        delete _p_tmp_dir;
        _p_tmp_dir = nullptr;
        throw pappso::PappsoException(QObject::tr("problem creating condor temporary directory in %1\n").arg(condor_tmp_dir));
    }
    _p_tmp_dir->setAutoRemove(true);

    _condor_submit_command = settings.value("condor/submit", "/usr/bin/condor_submit").toString();
    _condor_q_command = settings.value("condor/condor_q", "/usr/bin/condor_q").toString();
    _condor_rm_command = settings.value("condor/condor_rm", "/usr/bin/condor_rm").toString();
    _condor_request_memory = settings.value("condor/request_memory", "10000").toUInt();
}

/** \brief Releases the temporary directory object created by the constructor. */
TandemCondorProcess::~TandemCondorProcess()
{
    delete _p_tmp_dir;
}

/** \brief Returns the number of jobs reported by condor_submit for the cluster. */
unsigned int TandemCondorProcess::getCondorJobSize() const
{
    return _condor_job_size;
}

/**
 * \brief Writes "database.xml" in the temporary directory.
 *
 * The absolute path of the file is stored in _xml_database_file and its
 * content is produced by writeXmlDatabaseFile().
 *
 * \throws pappso::PappsoException if the file cannot be opened for writing.
 */
void TandemCondorProcess::prepareXmlDatabaseFile()
{
    QFile xml_database_file(QString("%1/database.xml").arg(_p_tmp_dir->path()));
    if (!xml_database_file.open(QIODevice::WriteOnly)) {
        throw pappso::PappsoException(QObject::tr("error : cannot open the XML database file : %1\n").arg(xml_database_file.fileName()));
    }

    _xml_database_file = QFileInfo(xml_database_file.fileName()).absoluteFilePath();

    // Stack-allocated writer: nothing leaks if writeXmlDatabaseFile() throws.
    QXmlStreamWriter xml_out(&xml_database_file);
    writeXmlDatabaseFile(&xml_out);
    xml_database_file.close();
}

/**
 * \brief Generates the condor submit file, submits it and waits for the jobs.
 *
 * One X!Tandem XML input file and one "Arguments/Queue" entry are written per
 * mz file of the batch. The submit file is then handed to the condor_submit
 * command, its output is parsed to get the cluster number and job count, and
 * surveyCondorJob() polls the queue until every job is completed.
 *
 * \throws pappso::PappsoException on any file, process or parsing failure.
 */
void TandemCondorProcess::run()
{
    qDebug() << "TandemCondorProcess::run begin ";

    QFileInfo preset_info(_tandem_run_batch._preset_file);
    _preset_file = QString("%1/%2").arg(_p_tmp_dir->path()).arg(preset_info.fileName());
    QFile::copy(_tandem_run_batch._preset_file, _preset_file);
    // NOTE(review): the preset is copied into the temporary directory but
    // _preset_file is then reset to the original path, so the copy is not the
    // one referenced afterwards. Kept as is; confirm whether this is intended.
    _preset_file = _tandem_run_batch._preset_file;

    prepareXmlDatabaseFile();

    // condor submit file :
    QFile submit_file(QString("%1/submit.txt").arg(_p_tmp_dir->path()));
    if (!submit_file.open(QIODevice::WriteOnly)) {
        throw pappso::PappsoException(QObject::tr("error : cannot open condor submit file : %1\n").arg(submit_file.fileName()));
    }

    // Stack-allocated stream: released automatically on every exit path.
    QTextStream submit_out(&submit_file);
    submit_out << "Universe   = vanilla" << endl;
    submit_out << "notification   = Error" << endl;
    submit_out << "Rank       = Mips" << endl;
    submit_out << "request_memory= " << _condor_request_memory << endl;
    submit_out << "request_cpus = 1" << endl;
    submit_out << "Executable = " << _tandem_run_batch._tandem_bin_path << endl;
    submit_out << "Log        = " << _p_tmp_dir->path() << "/condor.log" << endl;
    submit_out << "Output     = " << _p_tmp_dir->path() << "/tandem.$(Process).out" << endl;
    submit_out << "Error      = " << _p_tmp_dir->path() << "/tandem.$(Process).error" << endl;

    _p_monitor->setProgressMaximumValue(_tandem_run_batch._mz_file_list.size());

    // One X!Tandem input file and one queued job per mz file.
    for (QString mz_file : _tandem_run_batch._mz_file_list) {
        QTemporaryFile xml_input_file(QString("%1/tandem").arg(_p_tmp_dir->path()));
        // The file must outlive this object: condor jobs read it later.
        xml_input_file.setAutoRemove(false);
        if (!xml_input_file.open()) {
            throw pappso::PappsoException(QObject::tr("error : cannot open the XML X!Tandem input file : %1\n").arg(xml_input_file.fileName()));
        }

        QXmlStreamWriter xml_out(&xml_input_file);
        writeXmlInputFile(&xml_out, mz_file);
        xml_input_file.close();

        submit_out << "Arguments = " << QFileInfo(xml_input_file.fileName()).absoluteFilePath() << endl;
        submit_out << "Queue" << endl;
    }

    submit_out.flush();
    submit_file.close();

    // now run condor job on submit_file
    QStringList arguments;
    arguments << QFileInfo(submit_file.fileName()).absoluteFilePath();

    // Stack-allocated process, as in condorRemoveJob()/getCondorJobState():
    // it is cleaned up even when one of the checks below throws.
    QProcess condor_process;
    qDebug() << "TandemCondorProcess::run command " << _condor_submit_command << " " << arguments.join(" ");
    condor_process.start(_condor_submit_command, arguments);

    if (!condor_process.waitForStarted()) {
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed to start"));
    }
    if (!condor_process.waitForFinished(_max_xt_time_ms)) {
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed to finish"));
    }

    // Anything written on stderr is treated as a submission failure.
    QString perr = condor_process.readAllStandardError();
    if (perr.length()) {
        qDebug() << "TandemCondorProcess::run readAllStandardError " << perr;
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed :\n%1").arg(perr));
    }
    qDebug() << "TandemCondorProcess::run readAllStandardError OK " << perr;

    // condor_submit is expected to report the cluster on stdout.
    QString pjob = condor_process.readAllStandardOutput();
    if (!pjob.length()) {
        qDebug() << "TandemCondorProcess::run readAllStandardOutput " << pjob;
        throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem process failed :\n%1").arg(pjob));
    }
    qDebug() << "TandemCondorProcess::run readAllStandardOutput OK " << pjob;

    // Submitting job(s).\n1 job(s) submitted to cluster 29.\n
    parseCondorJobNumber(pjob);
    _p_monitor->setProgressMaximumValue(_condor_job_size);
    qDebug() << "TandemCondorProcess::run job=" << _condor_cluster_number << " size=" << _condor_job_size;

    const QProcess::ExitStatus exit_status = condor_process.exitStatus();
    qDebug() << "TandemCondorProcess::run ExitStatus " << exit_status;
    if (exit_status != QProcess::NormalExit) {
        // Report the command that was actually run, with its arguments and
        // the standard output already captured above (stdout was drained by
        // readAllStandardOutput(), so reading it again would yield nothing).
        throw pappso::PappsoException(QObject::tr("error executing HTCondor Status != 0 : %1 %2\n%3").arg(_condor_submit_command).arg(arguments.join(" ")).arg(pjob));
    }

    surveyCondorJob();
    qDebug() << "TandemCondorProcess::run end";
}

/**
 * \brief Polls the condor queue until all jobs of the cluster are completed.
 *
 * Every _condor_status_timer_millisecond the queue is queried with
 * getCondorJobState() and the monitor is updated. If shouldIstop() becomes
 * true the cluster is removed with condorRemoveJob() and an exception is
 * thrown.
 *
 * NOTE(review): the loop ends only when the completed count equals the job
 * size or on user stop; jobs ending in another state would keep it polling.
 *
 * \throws pappso::PappsoException when the user stops the jobs, or when a
 *         condor command fails.
 */
void TandemCondorProcess::surveyCondorJob()
{
    // condor is running job : we have to survey condor job using
    // "condor_q -xml _condor_cluster_number"
    bool keep_running = true;
    while (keep_running) {
        QThread::msleep(_condor_status_timer_millisecond);
        getCondorJobState();

        _p_monitor->message(QObject::tr("X!Tandem is running in condor cluster %1\n%2 on %3 jobs completed").arg(_condor_cluster_number).arg(_condor_completed_jobs).arg(_condor_job_size), _condor_completed_jobs);

        if (_condor_completed_jobs == _condor_job_size) {
            keep_running = false;
        }

        if (shouldIstop()) {
            // condor_rm
            condorRemoveJob();
            throw pappso::PappsoException(QObject::tr("HTCondor X!Tandem jobs stopped by the user"));
        }
    }
}

/**
 * \brief Removes the current cluster from the condor queue (condor_rm).
 *
 * \throws pappso::PappsoException if the command cannot start or finish,
 *         writes on stderr, or produces no output on stdout.
 */
void TandemCondorProcess::condorRemoveJob()
{
    QStringList arguments;
    arguments << QString("%1").arg(_condor_cluster_number);

    QProcess condor_rm_process;
    qDebug() << "TandemCondorProcess::condorRemoveJob command " << _condor_rm_command << " " << arguments.join(" ");
    condor_rm_process.start(_condor_rm_command, arguments);

    if (!condor_rm_process.waitForStarted()) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_rm process failed to start"));
    }
    if (!condor_rm_process.waitForFinished(_max_xt_time_ms)) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_rm process failed to finish"));
    }

    QString perr = condor_rm_process.readAllStandardError();
    if (perr.length()) {
        qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardError " << perr;
        // The failing tool here is condor_rm, not condor_q.
        throw pappso::PappsoException(QObject::tr("HTCondor condor_rm process failed :\n%1").arg(perr));
    }
    qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardError OK " << perr;

    QString pjob = condor_rm_process.readAllStandardOutput();
    if (!pjob.length()) {
        qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardOutput " << pjob;
        throw pappso::PappsoException(QObject::tr("HTCondor condor_rm process failed :\n%1").arg(pjob));
    }
    qDebug() << "TandemCondorProcess::condorRemoveJob readAllStandardOutput OK " << pjob;
}

/**
 * \brief Queries the condor queue ("condor_q -xml <cluster>") and parses it.
 *
 * The XML written on stdout is handed to parseCondorQueue().
 *
 * \throws pappso::PappsoException if the command cannot start or finish,
 *         writes on stderr, produces no output, or the XML cannot be parsed.
 */
void TandemCondorProcess::getCondorJobState()
{
    QStringList arguments;
    arguments << "-xml" << QString("%1").arg(_condor_cluster_number);

    QProcess condor_q_process;
    qDebug() << "TandemCondorProcess::getCondorJobState command " << _condor_q_command << " " << arguments.join(" ");
    condor_q_process.start(_condor_q_command, arguments);

    if (!condor_q_process.waitForStarted()) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed to start"));
    }
    if (!condor_q_process.waitForFinished(_max_xt_time_ms)) {
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed to finish"));
    }

    QString perr = condor_q_process.readAllStandardError();
    if (perr.length()) {
        qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardError " << perr;
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed :\n%1").arg(perr));
    }
    qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardError OK " << perr;

    QString pjob = condor_q_process.readAllStandardOutput();
    if (!pjob.length()) {
        qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardOutput " << pjob;
        throw pappso::PappsoException(QObject::tr("HTCondor condor_q process failed :\n%1").arg(pjob));
    }
    qDebug() << "TandemCondorProcess::getCondorJobState readAllStandardOutput OK " << pjob;

    parseCondorQueue(pjob);
}

/**
 * \brief Parses the XML produced by condor_q with CondorQxmlSaxHandler.
 *
 * \param condor_q_xml XML text read from the condor_q standard output.
 * \throws pappso::PappsoException if the XML cannot be parsed.
 */
void TandemCondorProcess::parseCondorQueue(QString & condor_q_xml)
{
    // Stack-allocated handler: released even when parsing fails and throws.
    CondorQxmlSaxHandler parser(this);

    QXmlSimpleReader simplereader;
    simplereader.setContentHandler(&parser);
    simplereader.setErrorHandler(&parser);

    qDebug() << "TandemCondorProcess::parseCondorQueue Read condor_q_xml";

    QXmlInputSource xml_input_source;
    xml_input_source.setData(condor_q_xml);

    if (!simplereader.parse(xml_input_source)) {
        qDebug() << parser.errorString();
        throw pappso::PappsoException(QObject::tr("Error reading condor_q xml string :\n %1\n%2").arg(parser.errorString()).arg(condor_q_xml));
    }
}

/**
 * \brief Extracts the job count and cluster number from condor_submit output.
 *
 * Expected text:
 *   Submitting job(s)...
 *   3 job(s) submitted to cluster 3.
 *
 * \param condor_job standard output of condor_submit.
 * \throws pappso::PappsoException if the numbers cannot be found.
 */
void TandemCondorProcess::parseCondorJobNumber(QString condor_job)
{
    // Both numbers need at least one digit, and no trailing wildcard is used:
    // a wildcard after an optional digit run could match with empty numbers
    // or swallow the last digit of the cluster id when nothing follows it.
    QRegExp txt_submit("([0-9]+) job\\(s\\) submitted to cluster ([0-9]+)");

    if (txt_submit.indexIn(condor_job, 0) == -1) {
        throw pappso::PappsoException(QObject::tr("unable to find HTCondor job numbers in %1").arg(condor_job));
    }

    _condor_cluster_number = txt_submit.cap(2).toUInt();
    _condor_job_size = txt_submit.cap(1).toUInt();
}

/**
 * \brief Publishes the per-status job counts and stores the completed count.
 *
 * The first seven counters are formatted into the monitor text, and the entry
 * indexed by CondorJobStatus::Completed is stored in _condor_completed_jobs.
 *
 * NOTE(review): counters are std::int8_t, so a count above 127 cannot be
 * represented; confirm against the code filling this array.
 *
 * \param count_status number of jobs for each condor status code.
 */
void TandemCondorProcess::setCondorJobStatus(std::int8_t count_status[10])
{
    // Explicit int conversion so the values are always formatted as numbers.
    QString status_message = QString("%1 unexpanded jobs\n%2 idle jobs\n%3 running jobs\n%4 removed jobs\n%5 completed jobs\n%6 held jobs\n%7 submission errors")
                             .arg(static_cast<int>(count_status[0]))
                             .arg(static_cast<int>(count_status[1]))
                             .arg(static_cast<int>(count_status[2]))
                             .arg(static_cast<int>(count_status[3]))
                             .arg(static_cast<int>(count_status[4]))
                             .arg(static_cast<int>(count_status[5]))
                             .arg(static_cast<int>(count_status[6]));
    _p_monitor->setText(status_message);

    _condor_completed_jobs = count_status[static_cast<std::int8_t>(CondorJobStatus::Completed)];
}