fixed where cached items would only be printed once per found repository - pypi_scrape

commit 584d9835c1e3b649920cc0bd08cd81928de688c1
parent 3d6ef55aaeb4ff183c6e3d98fd117b997da46b62
Author: miksa234 <milutin@popovic.xyz>
Date:   Tue, 19 Apr 2022 10:59:07 +0200

fixed where cached items would only be printed once per found repository

Diffstat:
M sort_data.py  | 12 +++++++-----

1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/sort_data.py b/sort_data.py
@@ -18,7 +18,7 @@ def main():
        dframes[fname] = pd.read_csv(path + fname, sep='|', keep_default_na=False)
     cache_list = [['package', 'requirement']]
 
-    for i, fname in enumerate(file_names):
+    for i, fname in enumerate(file_names[:10]):
 
         print(fname, f'{round(i/len(file_names)*100, 1)}%', sep='\t')
 
@@ -33,11 +33,13 @@ def main():
                     cache_list.append([package, requirement])
                     dframes[fname]['requirement'].iloc[j] = ''
 
-            if package in list(zip(*cache_list))[1]:
-                found_package, found_requirement = cache_list[list(zip(*cache_list))[1].index(package)]
-                dframes[fname].append({'package': found_package, 'requirement': found_requirement},\
+            if (index_found := np.where(np.array(cache_list, dtype='object')[:,1] == package)[0]).size != 0:
+                for i_found in index_found:
+                    found_package, found_requirement = cache_list[i_found]
+                    dframes[fname].append({'package': found_package, 'requirement': found_requirement},\
                                       ignore_index=True)
-                del cache_list[list(zip(*cache_list))[1].index(package)]
+                    del cache_list[i_found]
+
 
         dframes[fname] = dframes[fname].drop_duplicates()
         dframes[fname].to_csv('./data_sorted/' + fname, sep='|', index=False)

	pypi_scrape get data from the Python Package Index
	git clone git://popovic.xyz/pypi_scrape.git
	Log \| Files \| Refs