From 3b9a576290f3df2f147871701e52c98e1d4168b0 Mon Sep 17 00:00:00 2001
From: Sviatoslav Tsariov
Date: Wed, 13 Sep 2023 18:15:40 +0300
Subject: [PATCH] fix

Route the GraphQL requests through a requests.Session with an
HTTPAdapter/Retry (3 connect retries, 0.5s backoff factor) so transient
connection errors no longer abort a scrape, and move the Authorization
header onto the session. Also comment out the skip-if-file-exists check,
the 2000-result guard and the JSON file output in scrape(), so queries
are always re-run and results are persisted only via the MongoDB upsert.
---
 parser/collect.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/parser/collect.py b/parser/collect.py
index 0fb9d66..902165e 100644
--- a/parser/collect.py
+++ b/parser/collect.py
@@ -7,15 +7,23 @@
 import os
 import requests
 import time
 from datetime import datetime, timedelta
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 from config import Parser
 
 token = Parser.GithubToken
 
 def query(payload, variables=None):
-    r = requests.post(
+    session = requests.Session()
+    session.headers.update({'Authorization': f'bearer {token}'})
+    retry = Retry(connect=3, backoff_factor=0.5)
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount('http://', adapter)
+    session.mount('https://', adapter)
+
+    r = session.post(
         'https://api.github.com/graphql',
-        headers={'Authorization': f'bearer {token}'},
         json={"query": payload, "variables": variables or {}}
     )
     r.raise_for_status()
@@ -80,13 +88,13 @@ def get_count(q):
 
 
 def scrape(q, out_file):
-    path = f'responses/{out_file}'
-    if os.path.exists(path):
-        print('Skipping', path, 'already exists')
-        return
+    #path = f'responses/{out_file}'
+    #if os.path.exists(path):
+    #    print('Skipping', path, 'already exists')
+    #    return
     all_repos = []
     cursor = None
-    print('Creating', path)
+    #print('Creating', path)
     while True:
         r = get_repos(q, cursor, 100)
         search = r['search']
@@ -94,8 +102,8 @@
         cursor = pi['endCursor']
         has_next = pi['hasNextPage']
         total = search['repositoryCount']
-        if total > 2000:
-            raise ValueError(f'Too many results for {q}: {total}')
+        #if total > 2000:
+        #    raise ValueError(f'Too many results for {q}: {total}')
         all_repos += [e['node'] for e in search['edges']]
         print(r['rateLimit'])
         print(len(all_repos), ' / ', total, cursor)
@@ -118,8 +126,8 @@
             collection.update_one(filter_dict,
                                   {"$set": entity},
                                   upsert=True)
 
-    with open(path, 'w') as out:
-        print(out)
+    #with open(path, 'w') as out:
+        #print(out)
         #json.dump(all_repos, out)
     time.sleep(4)
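
Note on the pattern above: query() still builds a fresh Session (and a fresh
connection pool) on every call, and Retry(connect=3) retries only connection
errors, not transient 5xx responses. Below is a minimal standalone sketch of
the same Session/Retry idea, hoisted into a reusable helper and extended to
server errors; make_session, the GITHUB_TOKEN variable and the
status_forcelist values are illustrative assumptions, not part of this patch:

    import os
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def make_session(token):
        # Build the session once and reuse it so TCP/TLS connections
        # are pooled across calls instead of re-established each time.
        session = requests.Session()
        session.headers.update({'Authorization': f'bearer {token}'})
        # Retry connection errors and transient 5xx statuses with
        # exponential backoff (0.5s, 1s, 2s). POST must be listed in
        # allowed_methods explicitly: urllib3 (>= 1.26) does not retry
        # non-idempotent methods by default.
        retry = Retry(total=3, connect=3, backoff_factor=0.5,
                      status_forcelist=(500, 502, 503, 504),
                      allowed_methods=frozenset({'POST'}))
        session.mount('https://', HTTPAdapter(max_retries=retry))
        return session

    session = make_session(os.environ['GITHUB_TOKEN'])  # assumed env var
    r = session.post('https://api.github.com/graphql',
                     json={'query': '{ viewer { login } }'})
    r.raise_for_status()
    print(r.json())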