#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# License: GNU Affero General Public License v3 or later
# A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt.
pulling data from google drive dataset with simulated or synthetic communities
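
Example invocation (a sketch; the size, file, and output-path choices below are
illustrative):

    python datasets.py --community-type simulated --community-sizes 78Mbp --file-names README.md metagenome.fna.gz --dir-path simulated_communities

Files are written to <dir-path>/<community-size>/<file-name>,
e.g. simulated_communities/78Mbp/README.md.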
"""
import logging
import os

import gdown
import pandas as pd

from autometa.common.utilities import internet_is_connected

logger = logging.getLogger(__name__)

# Community sizes and file names available in the dataset index on Google Drive.
# These back both the CLI choices and the expansion of the "all" option below.
COMMUNITY_SIZES = (
    "78Mbp",
    "156Mbp",
    "312Mbp",
    "625Mbp",
    "1250Mbp",
    "2500Mbp",
    "5000Mbp",
    "10000Mbp",
)
FILE_NAMES = (
    "README.md",
    "reference_assignments.tsv.gz",
    "metagenome.fna.gz",
    "master.tsv.gz",
    "control_reads.tsv.gz",
    "control_contigs.tsv.gz",
    "unclustered_recruitment.tsv.gz",
    "binning.tsv.gz",
    "taxonomy.tsv.gz",
    "lengths.tsv.gz",
    "coverages.tsv.gz",
    "gc_content.tsv.gz",
    "kmers.embedded.tsv.gz",
    "kmers.tsv.gz",
    "markers.tsv.gz",
    "Bacteria.fna.gz",
    "orfs.faa.gz",
    "metagenome.filtered.fna.gz",
    "hmmscan.tsv.gz",
    "forward_reads.fastq.gz",
    "reverse_reads.fastq.gz",
)


def download(
    community_type: str, community_sizes: list, file_names: list, dir_path: str
) -> None:
    """Download the specified files for each requested community size.

    Parameters
    ----------
    community_type : str
        type of community dataset to download (currently only "simulated")
    community_sizes : list
        community size(s) to download, e.g. ["78Mbp", "156Mbp"]
    file_names : list
        file(s) to download for each community size
    dir_path : str
        output directory; files are placed in a subdirectory per community size

    Returns
    -------
    None
        files are downloaded to disk via gdown
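
    Examples
    --------
    A minimal sketch (the requested size and file must exist in the remote
    index):

    >>> download(
    ...     community_type="simulated",
    ...     community_sizes=["78Mbp"],
    ...     file_names=["README.md"],
    ...     dir_path="simulated_communities",
    ... )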
"""
    if community_type in ("synthetic", "all"):
        raise NotImplementedError("Only simulated communities are currently available")
    # The index CSV on Google Drive maps each (dataset, file) pair to its drive file ID
    df = pd.read_csv(
        "https://drive.google.com/uc?id=148fUO7jocoNOBUl2K4bCfjsbd42QxCzX",
        dtype=str,
        index_col=["dataset", "file"],
    )
    for community_size in community_sizes:
        community_size_outdir = os.path.join(dir_path, community_size)
        os.makedirs(community_size_outdir, exist_ok=True)
        for file_name in file_names:
            file_id = df.loc[(community_size, file_name), "file_id"]
            file_id_filepath = os.path.join(community_size_outdir, file_name)
            url = f"https://drive.google.com/uc?id={file_id}"
            gdown.download(url, file_id_filepath)


def main():
    import argparse

    logging.basicConfig(
        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
        datefmt="%m/%d/%Y %I:%M:%S %p",
        level=logging.DEBUG,
    )
    parser = argparse.ArgumentParser(
        description="Download a simulated community file from Google Drive to a specified output directory"
    )
    parser.add_argument(
        "--community-type",
        help="specify synthetic or simulated communities (currently only simulated is available)",
        choices=["synthetic", "simulated", "all"],
        required=True,
    )
    parser.add_argument(
        "--community-sizes",
        help="specify one or more community sizes to download",
        choices=[*COMMUNITY_SIZES, "all"],
        required=True,
        nargs="+",
    )
    parser.add_argument(
        "--file-names",
        help="specify one or more file names to download",
        choices=[*FILE_NAMES, "all"],
        nargs="+",
        required=True,
    )
    parser.add_argument(
        "--dir-path",
        help="specify a folder for the downloads (a subdirectory is created within it per community size)",
        required=True,
    )
    parser.add_argument(
        "--host",
        help="IP address used to check internet connectivity. Note: will attempt to connect to port 53 on the host address (default: 8.8.8.8, a Google public DNS server)",
        default="8.8.8.8",
    )
    args = parser.parse_args()

    community_sizes = (
        COMMUNITY_SIZES if "all" in args.community_sizes else args.community_sizes
    )
    file_names = FILE_NAMES if "all" in args.file_names else args.file_names
    if not internet_is_connected(host=args.host):
        logger.error(
            f"No internet connection detected (could not reach {args.host} on port 53). "
            "Please confirm your connection. The downloader will still attempt to run. "
            "(Use the --host argument to check a different IP address)"
        )

    download(
        community_type=args.community_type,
        community_sizes=community_sizes,
        file_names=file_names,
        dir_path=args.dir_path,
    )


if __name__ == "__main__":
    main()