Namo-R1/scripts/convert_vary.py
2025-02-22 12:25:49 +08:00

100 lines
2.8 KiB
Python

"""
sampling vary data
"""
import json
import os
import sys
import shutil
from PIL import Image
# Path of the annotation JSON file, taken from the first command-line argument.
# It is read as a module-level global by the functions defined below; running
# the script without an argument raises IndexError here.
a = sys.argv[1]
def cn(json_path=None, num_samples=100):
    """
    Build a small sample set from the Chinese annotation file.

    Every entry's image is looked up under ``pdf_cn_30w`` (next to the
    annotation file). Missing images are reported and skipped. Entries whose
    position in the file is below ``num_samples`` and whose image exists are
    copied into ``pdf_cn_30w_samples`` (keeping their relative path) and
    written to ``<name>_samples<ext>`` beside the annotation file.

    Args:
        json_path: Annotation JSON file (a list of dicts with an ``"image"``
            key). Defaults to the module-level ``a`` (command-line argument).
        num_samples: Number of leading entries eligible for sampling.
    """
    if json_path is None:
        json_path = a

    data_dir = os.path.dirname(json_path)
    img_root = os.path.join(data_dir, "pdf_cn_30w")
    sample_img_root = os.path.join(data_dir, "pdf_cn_30w_samples")
    os.makedirs(sample_img_root, exist_ok=True)

    with open(json_path, "r", encoding="utf-8") as f:
        res = json.load(f)

    samples = []
    for i, itm in enumerate(res):
        img_f = os.path.join(img_root, itm["image"])
        if not os.path.exists(img_f):
            print(f"{img_f} not found")
            # Skip the entry: copying a missing file would raise.
            continue
        if i < num_samples:
            target_img_f = os.path.join(sample_img_root, itm["image"])
            # Image names may contain sub-directories; mirror them.
            os.makedirs(os.path.dirname(target_img_f), exist_ok=True)
            shutil.copy(img_f, target_img_f)
            samples.append(itm)
    print(f"done {len(res)}")

    # Insert the suffix before the extension only, so directory names are
    # untouched and an extension-less input is never overwritten.
    root, ext = os.path.splitext(json_path)
    file_path = f"{root}_samples{ext}"
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)
def en(json_path=None, num_samples=100):
    """
    Build a sample set and an existing-images subset from the English file.

    Every entry's image is looked up under ``pdf_en_30w`` (next to the
    annotation file). Missing images are reported and skipped. Two files are
    written beside the annotation file:

    * ``<name>_samples<ext>``: entries whose position in the file is below
      ``num_samples`` and whose image exists; those images are also copied
      into ``pdf_en_30w_samples`` (keeping their relative path).
    * ``<name>_subset<ext>``: every entry whose image exists.

    Args:
        json_path: Annotation JSON file (a list of dicts with an ``"image"``
            key). Defaults to the module-level ``a`` (command-line argument).
        num_samples: Number of leading entries eligible for sampling.
    """
    if json_path is None:
        json_path = a

    data_dir = os.path.dirname(json_path)
    img_root = os.path.join(data_dir, "pdf_en_30w")
    sample_img_root = os.path.join(data_dir, "pdf_en_30w_samples")
    os.makedirs(sample_img_root, exist_ok=True)

    with open(json_path, "r", encoding="utf-8") as f:
        res = json.load(f)

    samples = []
    new_res = []
    for i, itm in enumerate(res):
        img_f = os.path.join(img_root, itm["image"])
        if not os.path.exists(img_f):
            print(f"{img_f} not found")
            continue
        new_res.append(itm)
        if i < num_samples:
            target_img_f = os.path.join(sample_img_root, itm["image"])
            # Image names may contain sub-directories; mirror them.
            os.makedirs(os.path.dirname(target_img_f), exist_ok=True)
            shutil.copy(img_f, target_img_f)
            samples.append(itm)
    print(f"done {len(res)}")

    # Insert the suffix before the extension only, so directory names are
    # untouched and an extension-less input is never overwritten.
    root, ext = os.path.splitext(json_path)

    file_path = f"{root}_samples{ext}"
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)

    file_path = f"{root}_subset{ext}"
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(new_res, f, ensure_ascii=False, indent=2)
    print(f"done {len(new_res)}")
def cn_subset(json_path=None, size_limit=660):
    """
    Choose the relatively smaller images out of the Chinese annotation file.

    Every entry's image is looked up under ``pdf_cn_30w`` (next to the
    annotation file). Missing images are reported and skipped. An entry is
    kept when its image width or height is below ``size_limit`` pixels. The
    kept entries are written to ``<name>_subset<ext>`` beside the annotation
    file.

    Args:
        json_path: Annotation JSON file (a list of dicts with an ``"image"``
            key). Defaults to the module-level ``a`` (command-line argument).
        size_limit: An image qualifies when either dimension is strictly
            smaller than this value.
    """
    if json_path is None:
        json_path = a

    img_root = os.path.join(os.path.dirname(json_path), "pdf_cn_30w")

    with open(json_path, "r", encoding="utf-8") as f:
        res = json.load(f)

    samples = []
    for itm in res:
        img_f = os.path.join(img_root, itm["image"])
        if not os.path.exists(img_f):
            print(f"{img_f} not found")
            # Skip the entry: opening a missing file would raise.
            continue
        # Close each image right after reading its size so file handles do
        # not pile up over a large dataset.
        with Image.open(img_f) as image:
            width, height = image.size
        if width < size_limit or height < size_limit:
            samples.append(itm)
    print(f"done {len(res)}")
    print(f"done {len(samples)}")

    # Insert the suffix before the extension only, so directory names are
    # untouched and an extension-less input is never overwritten.
    root, ext = os.path.splitext(json_path)
    file_path = f"{root}_subset{ext}"
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)
# Script driver: the selected task runs as soon as the module is executed
# (or imported). The English task is kept disabled; only cn_subset() runs.
# en()
cn_subset()