commit e67882729dd7cfb3377c77eb1e93743c92602258
parent 42d00cc8d82e8fdc53fd542a3853f121394dec2a
Author: miksa234 <milutin@popovic.xyz>
Date: Mon, 18 Apr 2022 15:27:52 +0200
sorting data when requirement not yet released
Diffstat:
2 files changed, 47 insertions(+), 1 deletion(-)
diff --git a/get_dependecies.py b/get_dependecies.py
@@ -13,7 +13,7 @@ def main():
path = './data/'
files = {}
- for i, package in enumerate(packages):
+ for i, package in enumerate(packages[:100]):
try:
json = requests.get(url.format(package)).json()
except:
diff --git a/sort_data.py b/sort_data.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3.6
+
+import os
+import pandas as pd
+import numpy as np
+from datetime import datetime
+
+import warnings
+# Suppress every FutureWarning for the whole process.
+# NOTE(review): presumably aimed at pandas deprecation notices -- confirm.
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+def main():
+
+ path = './data/'
+ file_names = sorted(os.listdir(path), key=lambda x: datetime.strptime(x, '%Y-%m.csv'))
+ dframes = {}
+
+ for fname in file_names:
+ dframes[fname] = pd.read_csv(path + fname, sep='|', keep_default_na=False)
+ cache_list = [['package', 'requirement']]
+
+ for i, fname in enumerate(file_names):
+
+ print(fname, f'{round(i/len(file_names)*100, 1)}%', sep='\t')
+
+ if i == 0:
+ all_before_pd = dframes[fname]
+ else:
+ all_before_pd = pd.concat([dframes[_] for _ in file_names[:i]])
+
+ for j, (package, requirement) in enumerate(dframes[fname].to_numpy()):
+ if requirement != '':
+ if requirement not in all_before_pd['package'].to_list():
+ cache_list.append([package, requirement])
+ dframes[fname]['requirement'].iloc[j] = ''
+
+ if package in list(zip(*cache_list))[1]:
+ found_package, found_requirement = cache_list[list(zip(*cache_list))[1].index(package)]
+ dframes[fname].append({'package': found_package, 'requirement': found_requirement},\
+ ignore_index=True)
+ del cache_list[list(zip(*cache_list))[1].index(package)]
+
+ dframes[fname] = dframes[fname].drop_duplicates()
+ dframes[fname].to_csv('./data_sorted/' + fname, sep='|', index=False)
+
+# Run the sort only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+ main()