Commit 23b3a5b8 authored by Mouhamadou Ba
Browse files

update

parent 7ec2afad
......@@ -12,7 +12,8 @@ rule get_PMCID:
doi_file = config["DOI_FILE"]
output:
pmcid_file = config["PMCID_FILE"],
doi_folder = directory("data/dois")
doi_folder = directory("data/dois"),
doi_not_found_file = ("data/not_found_dois.txt")
params:
request_root = config["PMC_REQUEST_ROOT"]
run:
......@@ -24,7 +25,8 @@ rule get_PMCID:
import pandas
import json
df = pandas.read_csv(input.doi_file)
data = []
found = []
not_found = []
for index, row in df.iterrows():
doi = row["doi"]
request = params.request_root + "search?query=DOI%3D"
......@@ -35,15 +37,21 @@ rule get_PMCID:
resp = requests.get(request, headers=headers)
binary = resp.content
Jdata = json.loads(binary)
#Jdata = resp.json()
pmcid = Jdata["resultList"]["result"][0]["fullTextIdList"]["fullTextId"][0]
data.append(pmcid)
Jlist = Jdata["resultList"]["result"]
if len(Jlist) and "fullTextIdList" in Jlist[0] and "fullTextId" in Jlist[0]["fullTextIdList"] and len(Jlist[0]["fullTextIdList"]["fullTextId"]):
pmcid = Jlist[0]["fullTextIdList"]["fullTextId"][0]
found.append(pmcid)
else:
not_found.append(doi)
filename = output.doi_folder + '/'+ str(doi) + '.json'
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'w+') as f:
f.write(resp.text)
df = pandas.DataFrame(data, columns=['pmcid'])
df = pandas.DataFrame(found, columns=['pmcid'])
df.to_csv(output.pmcid_file, index=False)
pandas.DataFrame(not_found, columns=['doi']).to_csv(output.doi_not_found_file, index=False)
......
......@@ -3,27 +3,30 @@ configfile: "config/config.yaml"
rule all:
input:
dois = config["PMCID_FILE"],
pmcids = config["PMCID_FILE"],
corpus = config["CORPUS_FOLDER"]
rule get_PMCID:
input:
doi_file = config["PMID_FILE"]
pmid_file = config["PMID_FILE"]
output:
pmcid_file = config["PMCID_FILE"],
pmid_folder = directory("data/pmid")
pmid_folder = directory("data/pmid"),
pmid_not_found_file = ("data/not_found_pmids.txt")
params:
request_root = config["PMC_REQUEST_ROOT"]
run:
import io
import os
import requests
from requests.structures import CaseInsensitiveDict
from requests.utils import requote_uri
import pandas
import json
df = pandas.read_csv(input.doi_file)
data = []
df = pandas.read_csv(input.pmid_file)
found = []
not_found = []
for index, row in df.iterrows():
pmid = row["pmid"]
request = params.request_root + "search?query="
......@@ -34,21 +37,24 @@ rule get_PMCID:
resp = requests.get(request, headers=headers)
binary = resp.content
Jdata = json.loads(binary)
#Jdata = resp.json()
pmcid = Jdata["resultList"]["result"][0]["fullTextIdList"]["fullTextId"][0]
data.append(pmcid)
Jlist = Jdata["resultList"]["result"]
if len(Jlist) and "fullTextIdList" in Jlist[0] and "fullTextId" in Jlist[0]["fullTextIdList"] and len(Jlist[0]["fullTextIdList"]["fullTextId"]):
pmcid = Jlist[0]["fullTextIdList"]["fullTextId"][0]
found.append(pmcid)
else:
not_found.append(pmid)
filename = output.pmid_folder + '/'+ str(pmid) + '.json'
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'w+') as f:
f.write(resp.text)
df = pandas.DataFrame(data, columns=['pmcid'])
df.to_csv(output.pmcid_file, index=False)
pandas.DataFrame(found, columns=['pmcid']).to_csv(output.pmcid_file, index=False)
pandas.DataFrame(not_found, columns=['pmid']).to_csv(output.pmid_not_found_file, index=False)
rule get_XML:
input:
doi_file = config["PMCID_FILE"]
pmcid_file = config["PMCID_FILE"]
output:
corpus_folder = directory(config["CORPUS_FOLDER"])
params:
......@@ -61,7 +67,7 @@ rule get_XML:
from requests.utils import requote_uri
import pandas
import json
df = pandas.read_csv(input.doi_file)
df = pandas.read_csv(input.pmcid_file)
data = []
for index, row in df.iterrows():
pmcid = row["pmcid"]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment