pypi_scrape

Get data from the Python Package Index
git clone git://popovic.xyz/pypi_scrape.git
Log | Files | Refs | LICENSE

sort_data.py (1731B)


      1 #!/usr/bin/env python3.6
      2 
      3 import os
      4 import pandas as pd
      5 import numpy as np
      6 from datetime import datetime
      7 
      8 import warnings
      9 warnings.simplefilter(action='ignore', category=FutureWarning)
     10 
     11 def main():
     12     path = './data/'
     13     file_names =  sorted(os.listdir(path), key=lambda x: datetime.strptime(x, '%Y-%m.csv'))
     14     dframes = {}
     15 
     16     for fname in file_names:
     17        dframes[fname] = np.genfromtxt(path + fname, delimiter='|', dtype='str')[1:]
     18 
     19     cache_array = np.array([['package', 'requirement']], dtype='str')
     20 
     21     for i, fname in enumerate(file_names):
     22 
     23         print(fname, f'{round(i/len(file_names)*100, 1)}%    {len(cache_array)}', sep='\t')
     24 
     25         if i == 0:
     26             all_before_data = dframes[fname]
     27         else:
     28             all_before_data = np.vstack([dframes[_] for _ in file_names[:i]])
     29 
     30         for j, (package, requirement) in enumerate(dframes[fname]):
     31             if requirement != '':
     32                 if requirement not in all_before_data[:,0]:
     33                     cache_array = np.vstack([cache_array, [package, requirement]])
     34                     dframes[fname][:,1][j] = ''
     35 
     36             index_found = np.where(cache_array[:,1] == package)[0]
     37             if index_found.size != 0:
     38                 for i_found in index_found:
     39                     found_package, found_requirement = cache_array[i_found]
     40                     dframes[fname] = np.vstack([dframes[fname], [found_package, found_requirement]])
     41                 cache_array = np.delete(cache_array, index_found, axis=0)
     42 
     43 
     44         dframes[fname] = np.unique(dframes[fname], axis=0)
     45         pd.DataFrame(dframes[fname], columns=['package', 'requirement']).to_csv('./data_sorted/' + fname, sep='|', index=False)
     46 
     47 
     48 
     49 if __name__ == '__main__':
     50     main()