Commit 5ddff48d authored by Mouhamadou Ba's avatar Mouhamadou Ba
Browse files

adding snakefile and config

parent 6cfbe27e
# Workflow parameters: input ID lists, output corpus folder and Europe PMC REST endpoint
DOIS_FILE : "data/dois.txt"
PMCID_FILE: "data/pmcids.txt"
PMID_FILE: "data/pmids.txt"
CORPUS_FOLDER: "data/corpus"
PMC_REQUEST_ROOT: "https://www.ebi.ac.uk/europepmc/webservices/rest/"
doi
https://doi.org/10.1371/journal.pone.0186766
https://doi.org/10.1016/j.dib.2020.105453
pmid
29253898
32300619
# Load the workflow parameters (file locations and REST endpoint root).
configfile: "config/config.yaml"


# Default target rule: asks for the PMCID list and the downloaded corpus folder.
# The keyword "dois" is only a label here; the requested file is the PMCID list.
rule all:
    input:
        dois=config["PMCID_FILE"],
        corpus=config["CORPUS_FOLDER"],
# Resolve every DOI in the input CSV (column "doi") to a PMCID through the
# Europe PMC search endpoint and write the hits to a one-column CSV ("pmcid").
# DOIs with no search hit, or whose first hit carries no PMCID, are reported
# and left out so that downstream rules never receive an empty identifier.
rule get_PMCID:
    input:
        doi_file=config["DOIS_FILE"],
    output:
        pmcid_file=config["PMCID_FILE"],
    params:
        request_root=config["PMC_REQUEST_ROOT"],
    run:
        import pandas
        import requests

        search_url = params.request_root + "search"
        headers = {"accept": "application/json"}
        pmcids = []
        for doi in pandas.read_csv(input.doi_file)["doi"]:
            # Let requests build and percent-encode the query string so that
            # reserved characters inside a DOI cannot corrupt the URL.
            resp = requests.get(
                search_url,
                params={
                    "query": "DOI=" + str(doi),
                    "resultType": "lite",
                    "cursorMark": "*",
                    "pageSize": 25,
                    "fromSearchPost": "false",
                },
                headers=headers,
                timeout=60,  # never hang the workflow on a stalled connection
            )
            # A failed search must stop the rule; otherwise an incomplete
            # PMCID list would be recorded as a finished output.
            resp.raise_for_status()
            results = resp.json().get("resultList", {}).get("result", [])
            # Only the first hit is considered, as before.
            pmcid = results[0].get("pmcid") if results else None
            if not pmcid:
                print("No PMCID found for DOI", doi, "- skipping")
                continue
            pmcids.append(pmcid)
        pandas.DataFrame({"pmcid": pmcids}).to_csv(output.pmcid_file, index=False)
# Download the full-text XML of every PMCID in the input CSV (column "pmcid")
# into the corpus folder, one "<pmcid>.xml" file per article. Articles whose
# request fails are reported and skipped instead of being saved as bogus XML.
rule get_XML:
    input:
        pmcid_file=config["PMCID_FILE"],
    output:
        corpus_folder=directory(config["CORPUS_FOLDER"]),
    params:
        request_root=config["PMC_REQUEST_ROOT"],
    run:
        import os

        import pandas
        import requests

        # Create the folder once, up front, so the declared output exists even
        # when the list is empty or nothing can be downloaded.
        os.makedirs(output.corpus_folder, exist_ok=True)
        headers = {"accept": "application/xml"}
        # dropna(): a missing PMCID is read back as NaN and cannot form a URL.
        for pmcid in pandas.read_csv(input.pmcid_file)["pmcid"].dropna():
            pmcid = str(pmcid)
            resp = requests.get(
                params.request_root + pmcid + "/fullTextXML",
                headers=headers,
                timeout=60,  # never hang the workflow on a stalled connection
            )
            if not resp.ok:
                # Do not store an HTTP error body under an .xml file name.
                print("Full text unavailable for", pmcid, "- HTTP", resp.status_code)
                continue
            # Write the raw bytes so the document is stored exactly as served,
            # independent of the platform's default text encoding.
            with open(os.path.join(output.corpus_folder, pmcid + ".xml"), "wb") as fh:
                fh.write(resp.content)
\ No newline at end of file
# Load the workflow parameters (file locations and REST endpoint root).
configfile: "config/config.yaml"


# Default target rule: asks for the downloaded corpus folder.
rule all:
    input:
        corpus=config["CORPUS_FOLDER"],
# Download the full-text XML of every PMCID in the input CSV (column "pmcid")
# into the corpus folder, one "<pmcid>.xml" file per article. Articles whose
# request fails are reported and skipped instead of being saved as bogus XML.
rule get_XML:
    input:
        pmcid_file=config["PMCID_FILE"],
    output:
        corpus_folder=directory(config["CORPUS_FOLDER"]),
    params:
        request_root=config["PMC_REQUEST_ROOT"],
    run:
        import os

        import pandas
        import requests

        # Create the folder once, up front, so the declared output exists even
        # when the list is empty or nothing can be downloaded.
        os.makedirs(output.corpus_folder, exist_ok=True)
        headers = {"accept": "application/xml"}
        # dropna(): a missing PMCID is read back as NaN and cannot form a URL.
        for pmcid in pandas.read_csv(input.pmcid_file)["pmcid"].dropna():
            pmcid = str(pmcid)
            resp = requests.get(
                params.request_root + pmcid + "/fullTextXML",
                headers=headers,
                timeout=60,  # never hang the workflow on a stalled connection
            )
            if not resp.ok:
                # Do not store an HTTP error body under an .xml file name.
                print("Full text unavailable for", pmcid, "- HTTP", resp.status_code)
                continue
            # Write the raw bytes so the document is stored exactly as served,
            # independent of the platform's default text encoding.
            with open(os.path.join(output.corpus_folder, pmcid + ".xml"), "wb") as fh:
                fh.write(resp.content)
# Load the workflow parameters (file locations and REST endpoint root).
configfile: "config/config.yaml"


# Default target rule: asks for the PMCID list and the downloaded corpus folder.
# The keyword "dois" is only a label here; the requested file is the PMCID list.
rule all:
    input:
        dois=config["PMCID_FILE"],
        corpus=config["CORPUS_FOLDER"],
# Resolve every PMID in the input CSV (column "pmid") to a PMCID through the
# Europe PMC search endpoint and write the hits to a one-column CSV ("pmcid").
# PMIDs with no search hit, or whose first hit carries no PMCID, are reported
# and left out so that downstream rules never receive an empty identifier.
rule get_PMCID:
    input:
        pmid_file=config["PMID_FILE"],
    output:
        pmcid_file=config["PMCID_FILE"],
    params:
        request_root=config["PMC_REQUEST_ROOT"],
    run:
        import pandas
        import requests

        search_url = params.request_root + "search"
        headers = {"accept": "application/json"}
        pmcids = []
        for pmid in pandas.read_csv(input.pmid_file)["pmid"]:
            # NOTE(review): the bare PMID is sent as a free-text query, as
            # before; a fielded query may be more precise - confirm against
            # the Europe PMC search syntax.
            resp = requests.get(
                search_url,
                params={
                    "query": str(pmid),
                    "resultType": "lite",
                    "cursorMark": "*",
                    "pageSize": 25,
                    "fromSearchPost": "false",
                },
                headers=headers,
                timeout=60,  # never hang the workflow on a stalled connection
            )
            # A failed search must stop the rule; otherwise an incomplete
            # PMCID list would be recorded as a finished output.
            resp.raise_for_status()
            results = resp.json().get("resultList", {}).get("result", [])
            # Only the first hit is considered, as before.
            pmcid = results[0].get("pmcid") if results else None
            if not pmcid:
                print("No PMCID found for PMID", pmid, "- skipping")
                continue
            pmcids.append(pmcid)
        pandas.DataFrame({"pmcid": pmcids}).to_csv(output.pmcid_file, index=False)
# Download the full-text XML of every PMCID in the input CSV (column "pmcid")
# into the corpus folder, one "<pmcid>.xml" file per article. Articles whose
# request fails are reported and skipped instead of being saved as bogus XML.
rule get_XML:
    input:
        pmcid_file=config["PMCID_FILE"],
    output:
        corpus_folder=directory(config["CORPUS_FOLDER"]),
    params:
        request_root=config["PMC_REQUEST_ROOT"],
    run:
        import os

        import pandas
        import requests

        # Create the folder once, up front, so the declared output exists even
        # when the list is empty or nothing can be downloaded.
        os.makedirs(output.corpus_folder, exist_ok=True)
        headers = {"accept": "application/xml"}
        # dropna(): a missing PMCID is read back as NaN and cannot form a URL.
        for pmcid in pandas.read_csv(input.pmcid_file)["pmcid"].dropna():
            pmcid = str(pmcid)
            resp = requests.get(
                params.request_root + pmcid + "/fullTextXML",
                headers=headers,
                timeout=60,  # never hang the workflow on a stalled connection
            )
            if not resp.ok:
                # Do not store an HTTP error body under an .xml file name.
                print("Full text unavailable for", pmcid, "- HTTP", resp.status_code)
                continue
            # Write the raw bytes so the document is stored exactly as served,
            # independent of the platform's default text encoding.
            with open(os.path.join(output.corpus_folder, pmcid + ".xml"), "wb") as fh:
                fh.write(resp.content)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment