Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 4ae59df

Browse files
Merge pull request #336 from openclimatefix/copy-to-gcp
Copy to gcp
2 parents c75d136 + a6f0d5e commit 4ae59df

File tree

2 files changed

+74
-5
lines changed

2 files changed

+74
-5
lines changed

notebooks/copy_to_gcp.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
""" copy a folder from local to gcp """
2+
import logging
3+
from concurrent import futures
4+
5+
from nowcasting_dataset.filesystem.utils import get_all_filenames_in_path, upload_one_file
6+
7+
# Show debug-level output from the nowcasting_dataset logger while copying.
logging.basicConfig()
_LOG = logging.getLogger("nowcasting_dataset")
_LOG.setLevel(logging.DEBUG)

# Dataset splits, and the data-source sub-folders copied for each split.
sets = ["train", "validation", "test"]
data_sources = ["gsp", "metadata", "nwp", "pv", "satellite", "sun", "topographic"]

# Destination (GCP bucket) and source (local disk) roots. Files are looked up
# under <root>/<set>/<data_source>/ on both sides.
GCP_PATH = "gs://solar-pv-nowcasting-data/prepared_ML_training_data/v10"
LOCAL_PATH = (
    "/mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/"
    "prepared_ML_training_data/v10"
)

# Maps every local .nc file that still has to be uploaded to its GCP target.
all_filenames = {}
for dset in sets:
    for data_source in data_sources:
        local_dir = f"{LOCAL_PATH}/{dset}/{data_source}"
        gcp_dir = f"{GCP_PATH}/{dset}/{data_source}"

        local_files = sorted(get_all_filenames_in_path(local_dir))

        # get files already in GCP; a failed listing (e.g. the folder does not
        # exist yet) is treated as "nothing uploaded so far", but is logged so
        # that a genuine problem is not hidden.
        try:
            remote_files = get_all_filenames_in_path(gcp_dir)
        except Exception:
            _LOG.warning("Could not list %s, assuming it is empty", gcp_dir, exc_info=True)
            remote_files = []

        # only get .nc files (match the suffix, not any ".nc" substring)
        local_nc_files = [name for name in local_files if name.endswith(".nc")]
        # A set gives O(1) membership tests in the loop below.
        remote_nc_files = {name for name in remote_files if name.endswith(".nc")}
        print(f"Already {len(remote_nc_files)} in gsp folder already: {gcp_dir}")

        # remove file if already in GCP. The remote names are compared without
        # the "gs://" protocol prefix.
        # NOTE(review): assumes the remote listing returns "bucket/key" style
        # names without the protocol - confirm against get_all_filenames_in_path.
        remote_prefix = gcp_dir.replace("gs://", "")
        pending = {}
        for local_file in local_nc_files:
            basename = local_file.split("/")[-1]
            if f"{remote_prefix}/{basename}" not in remote_nc_files:
                pending[local_file] = f"{gcp_dir}/{basename}"
        print(f"There are {len(pending)} to upload")

        all_filenames.update(pending)
48+
49+
50+
def one_file(local_file, gsp_file, start_index=0):
    """Copy one file from local to GCP.

    The upload is delegated to ``upload_one_file`` with ``overwrite=False``.

    Args:
        local_file: path of the local file. The last six characters of its
            base name, before the first ".", are parsed as the file index.
        gsp_file: destination path of the file in GCP.
        start_index: only files whose index is at least this value are
            uploaded. The default of 0 matches the previous hard-coded
            threshold; raise it to copy only files after a certain number.

    Raises:
        ValueError: if no integer index can be parsed from the file name.
    """
    # Parse the index from the base name only, so that a "." somewhere in the
    # directory part of the path (e.g. ".../v1.0/...") cannot break the parse.
    basename = local_file.split("/")[-1]
    file_index = int(basename.split(".")[0][-6:])
    if file_index < start_index:
        return
    print(gsp_file)
    upload_one_file(remote_filename=gsp_file, local_filename=local_file, overwrite=False)
57+
58+
59+
# test to see if it works: copy a single file synchronously first, so that a
# problem shows up immediately instead of inside the thread pool.
if all_filenames:
    first_local_file, first_gsp_file = next(iter(all_filenames.items()))
    one_file(first_local_file, first_gsp_file)
else:
    # Nothing is left to copy (e.g. on a re-run); indexing the first entry of
    # an empty mapping would otherwise raise IndexError here.
    print("There are no files to upload")

# loop over files
with futures.ThreadPoolExecutor(max_workers=2) as executor:
    # Submit tasks to the executor, remembering which local file each copies.
    upload_tasks = {
        executor.submit(one_file, local_file=k, gsp_file=v): k
        for k, v in all_filenames.items()
    }
    # An exception raised inside a worker is stored on its future and stays
    # invisible unless the result is requested, so check every task and log
    # failures instead of dropping them silently.
    for task in futures.as_completed(upload_tasks):
        try:
            task.result()
        except Exception:
            _LOG.exception("Failed to upload %s", upload_tasks[task])

nowcasting_dataset/filesystem/utils.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,20 +142,21 @@ def download_to_local(remote_filename: str, local_filename: str):
142142
filesystem.get(remote_filename, local_filename)
143143

144144

145-
def upload_one_file(remote_filename: str, local_filename: str, overwrite: bool = True):
    """
    Upload one file to aws or gcp

    Args:
        remote_filename: the aws/gcp key name
        local_filename: the local file name
        overwrite: if True (the default) the file is always uploaded; if False
            it is uploaded only when `remote_filename` does not exist yet

    """
    filesystem = get_filesystem(remote_filename)
    # Use logical `not` here, never bitwise `~`: on a bool, `~True` is -2 and
    # `~False` is -1, both truthy, which made overwrite=False upload anyway.
    # The existence check is skipped entirely when overwriting.
    if overwrite or not filesystem.exists(remote_filename):
        filesystem.put(local_filename, remote_filename)
159160

160161

161162
def makedirs(path: Union[str, Path], exist_ok: bool = True) -> None:

0 commit comments

Comments
 (0)