Sviatoslav Tsariov Yurievich 2023-09-13 18:15:40 +03:00
parent d94a244bc4
commit 3b9a576290


@@ -7,15 +7,23 @@ import os
 import requests
 import time
 from datetime import datetime, timedelta
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 from config import Parser
 token = Parser.GithubToken
 def query(payload, variables=None):
-    r = requests.post(
+    session = requests.Session()
+    session.headers.update({'Authorization': f'bearer {token}'})
+    retry = Retry(connect=3, backoff_factor=0.5)
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+    r = session.post(
         'https://api.github.com/graphql',
-        headers={'Authorization': f'bearer {token}'},
         json={"query": payload, "variables": variables or {}}
     )
     r.raise_for_status()
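
Note on the hunk above: the bare requests.post call is replaced by a requests.Session that carries the Authorization header and has urllib3 retries mounted for both schemes, so transient connection failures to api.github.com are retried with exponential backoff instead of failing the whole run. A minimal self-contained sketch of the same pattern (the helper name make_session and the example token are hypothetical, not part of this file):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(token):
    # The Session keeps the auth header and the retry policy in one place
    # and pools connections if it is reused across calls.
    session = requests.Session()
    session.headers.update({'Authorization': f'bearer {token}'})
    # Retry connection errors up to 3 times with exponential backoff (factor 0.5).
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

# Usage (hypothetical token value):
# session = make_session('ghp_example')
# r = session.post('https://api.github.com/graphql',
#                  json={'query': '{ viewer { login } }', 'variables': {}})
# r.raise_for_status()
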
@@ -80,13 +88,13 @@ def get_count(q)
 def scrape(q, out_file):
-    path = f'responses/{out_file}'
-    if os.path.exists(path):
-        print('Skipping', path, 'already exists')
-        return
+    #path = f'responses/{out_file}'
+    #if os.path.exists(path):
+    #    print('Skipping', path, 'already exists')
+    #    return
     all_repos = []
     cursor = None
-    print('Creating', path)
+    #print('Creating', path)
     while True:
         r = get_repos(q, cursor, 100)
         search = r['search']
@@ -94,8 +102,8 @@ def scrape(q, out_file)
         cursor = pi['endCursor']
         has_next = pi['hasNextPage']
         total = search['repositoryCount']
-        if total > 2000:
-            raise ValueError(f'Too many results for {q}: {total}')
+        #if total > 2000:
+        #    raise ValueError(f'Too many results for {q}: {total}')
         all_repos += [e['node'] for e in search['edges']]
         print(r['rateLimit'])
         print(len(all_repos), ' / ', total, cursor)
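
Note on the loop being edited here: scrape() pages through the GraphQL search connection by cursor until hasNextPage is false, accumulating the repository nodes. A pared-down sketch of that pagination pattern, assuming the module's existing get_repos(q, cursor, page_size) helper (the function name fetch_all is hypothetical):

def fetch_all(q, page_size=100):
    # Walk the search connection cursor by cursor, collecting repository nodes.
    all_repos = []
    cursor = None
    while True:
        r = get_repos(q, cursor, page_size)  # existing helper in this module
        search = r['search']
        pi = search['pageInfo']
        cursor = pi['endCursor']
        all_repos += [e['node'] for e in search['edges']]
        if not pi['hasNextPage']:
            return all_repos
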
@@ -118,8 +126,8 @@ def scrape(q, out_file)
             collection.update_one(filter_dict, {"$set": entity}, upsert=True)
-        with open(path, 'w') as out:
-            print(out)
+        #with open(path, 'w') as out:
+        #    print(out)
             #json.dump(all_repos, out)
         time.sleep(4)
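
Note on the unchanged update_one line in the last hunk: with upsert=True, pymongo inserts the repository document when the filter matches nothing and otherwise overwrites the listed fields, so re-running the scraper stays idempotent. A small sketch, assuming a local MongoDB and a repos collection keyed by nameWithOwner (the real connection string, database, and filter fields are not shown in this diff):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')  # assumed connection string
collection = client['github']['repos']             # assumed database/collection names

def upsert_repo(entity):
    # Insert the document if its key is new, otherwise update its fields in place.
    filter_dict = {'nameWithOwner': entity['nameWithOwner']}
    collection.update_one(filter_dict, {'$set': entity}, upsert=True)
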