pypi_scrape

Get data from the Python Package Index
git clone git://popovic.xyz/pypi_scrape.git
Log | Files | Refs

commit e67882729dd7cfb3377c77eb1e93743c92602258
parent 42d00cc8d82e8fdc53fd542a3853f121394dec2a
Author: miksa234 <milutin@popovic.xyz>
Date:   Mon, 18 Apr 2022 15:27:52 +0200

sorting data when requirement not yet released

Diffstat:
M get_dependecies.py |  2 +-
A sort_data.py       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/get_dependecies.py b/get_dependecies.py
@@ -13,7 +13,7 @@ def main():
     path = './data/'
     files = {}
 
-    for i, package in enumerate(packages):
+    for i, package in enumerate(packages[:100]):
         try:
             json = requests.get(url.format(package)).json()
         except:
diff --git a/sort_data.py b/sort_data.py
@@ -0,0 +1,46 @@
+#/usr/bin/env python3.6
+
+import os
+import pandas as pd
+import numpy as np
+from datetime import datetime
+
+import warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def main():
+
+    path = './data/'
+    file_names = sorted(os.listdir(path), key=lambda x: datetime.strptime(x, '%Y-%m.csv'))
+    dframes = {}
+
+    for fname in file_names:
+        dframes[fname] = pd.read_csv(path + fname, sep='|', keep_default_na=False)
+    cache_list = [['package', 'requirement']]
+
+    for i, fname in enumerate(file_names):
+
+        print(fname, f'{round(i/len(file_names)*100, 1)}%', sep='\t')
+
+        if i == 0:
+            all_before_pd = dframes[fname]
+        else:
+            all_before_pd = pd.concat([dframes[_] for _ in file_names[:i]])
+
+        for j, (package, requirement) in enumerate(dframes[fname].to_numpy()):
+            if requirement != '':
+                if requirement not in all_before_pd['package'].to_list():
+                    cache_list.append([package, requirement])
+                    dframes[fname]['requirement'].iloc[j] = ''
+
+            if package in list(zip(*cache_list))[1]:
+                found_package, found_requirement = cache_list[list(zip(*cache_list))[1].index(package)]
+                dframes[fname].append({'package': found_package, 'requirement': found_requirement},\
+                        ignore_index=True)
+                del cache_list[list(zip(*cache_list))[1].index(package)]
+
+        dframes[fname] = dframes[fname].drop_duplicates()
+        dframes[fname].to_csv('./data_sorted/' + fname, sep='|', index=False)
+
+if __name__ == '__main__':
+    main()