Skip to content

Commit 8630f8d

Browse files
Merge pull request #476 from scholarly-python-package/develop
Release v1.7.7
2 parents f5b24dd + 7d2d028 commit 8630f8d

4 files changed

Lines changed: 28 additions & 16 deletions

File tree

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ bibtexparser
44
deprecated
55
fake_useragent
66
free-proxy
7+
httpx
78
python-dotenv
89
requests[socks]
910
selenium

scholarly/_navigator.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import random
1212
import time
1313
from requests.exceptions import Timeout
14+
from httpx import TimeoutException
1415
from selenium.webdriver.common.by import By
1516
from .publication_parser import _SearchScholarIterator
1617
from .author_parser import AuthorParser
@@ -111,7 +112,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
111112
w = random.uniform(1,2)
112113
time.sleep(w)
113114
resp = session.get(pagerequest, timeout=timeout)
114-
self.logger.debug("Session proxy config is {}".format(session.proxies))
115+
self.logger.debug("Session proxy config is {}".format(pm._proxies))
115116

116117
has_captcha = self._requests_has_captcha(resp.text)
117118

@@ -149,7 +150,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
149150
self.logger.info("Will retry after %.2f seconds (with the same session).", w)
150151
time.sleep(w)
151152
continue
152-
except Timeout as e:
153+
except (Timeout, TimeoutException) as e:
153154
err = "Timeout Exception %s while fetching page: %s" % (type(e).__name__, e.args)
154155
self.logger.info(err)
155156
if timeout < 3*self._TIMEOUT:
@@ -164,7 +165,7 @@ def _get_page(self, pagerequest: str, premium: bool = False) -> str:
164165

165166
tries += 1
166167
try:
167-
session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=session.proxies.get('http', None))
168+
session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout, old_proxy=pm._proxies.get('http', None))
168169
except Exception:
169170
self.logger.info("No other secondary connections possible. "
170171
"Using the primary proxy for all requests.")

scholarly/_proxy_generator.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
import time
66
import requests
7+
import httpx
78
import tempfile
89
import urllib3
910

@@ -43,6 +44,7 @@ def __init__(self):
4344
# If we use a proxy or Tor, we set this to True
4445
self._proxy_works = False
4546
self.proxy_mode = None
47+
self._proxies = {}
4648
# If we have a Tor server that we can refresh, we set this to True
4749
self._tor_process = None
4850
self._can_refresh_tor = False
@@ -183,8 +185,12 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
183185
"""
184186
if https is None:
185187
https = http
188+
if http[:4] != "http":
189+
http = "http://" + http
190+
if https[:5] != "https":
191+
https = "https://" + https
186192

187-
proxies = {'http': http, 'https': https}
193+
proxies = {'http://': http, 'https://': https}
188194
if self.proxy_mode == ProxyMode.SCRAPERAPI:
189195
r = requests.get("http://api.scraperapi.com/account", params={'api_key': self._API_KEY}).json()
190196
if "error" in r:
@@ -198,7 +204,7 @@ def _use_proxy(self, http: str, https: str = None) -> bool:
198204
self._proxy_works = self._check_proxy(proxies)
199205

200206
if self._proxy_works:
201-
self._session.proxies = proxies
207+
self._proxies = proxies
202208
self._new_session()
203209

204210
return self._proxy_works
@@ -353,8 +359,8 @@ def _get_webdriver(self):
353359
def _get_chrome_webdriver(self):
354360
if self._proxy_works:
355361
webdriver.DesiredCapabilities.CHROME['proxy'] = {
356-
"httpProxy": self._session.proxies['http'],
357-
"sslProxy": self._session.proxies['https'],
362+
"httpProxy": self._proxies['http'],
363+
"sslProxy": self._proxies['https'],
358364
"proxyType": "MANUAL"
359365
}
360366

@@ -369,8 +375,8 @@ def _get_firefox_webdriver(self):
369375
if self._proxy_works:
370376
# Redirect webdriver through proxy
371377
webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
372-
"httpProxy": self._session.proxies['http'],
373-
"sslProxy": self._session.proxies['https'],
378+
"httpProxy": self._proxies['http'],
379+
"sslProxy": self._proxies['https'],
374380
"proxyType": "MANUAL",
375381
}
376382

@@ -439,11 +445,12 @@ def _handle_captcha2(self, url):
439445
return self._session
440446

441447
def _new_session(self):
448+
init_kwargs = {}
442449
proxies = {}
443450
if self._session:
444-
proxies = self._session.proxies
451+
proxies = self._proxies
445452
self._close_session()
446-
self._session = requests.Session()
453+
# self._session = httpx.Client()
447454
self.got_403 = False
448455

449456
# Suppress the misleading traceback from UserAgent()
@@ -453,15 +460,18 @@ def _new_session(self):
453460
'accept': 'text/html,application/xhtml+xml,application/xml',
454461
'User-Agent': UserAgent().random,
455462
}
456-
self._session.headers.update(_HEADERS)
463+
# self._session.headers.update(_HEADERS)
464+
init_kwargs.update(headers=_HEADERS)
457465

458466
if self._proxy_works:
459-
self._session.proxies = proxies
467+
init_kwargs["proxies"] = proxies #.get("http", None)
468+
self._proxies = proxies
460469
if self.proxy_mode is ProxyMode.SCRAPERAPI:
461470
# SSL Certificate verification must be disabled for
462471
# ScraperAPI requests to work.
463472
# https://www.scraperapi.com/documentation/
464-
self._session.verify = False
473+
init_kwargs["verify"] = False
474+
self._session = httpx.Client(**init_kwargs)
465475
self._webdriver = None
466476

467477
return self._session
@@ -496,7 +506,7 @@ def _fp_coroutine(self, timeout=1, wait_time=120):
496506
all_proxies = freeproxy.get_proxy_list()
497507
if proxy in self._dirty_freeproxies:
498508
continue
499-
proxies = {'http': proxy, 'https': proxy}
509+
proxies = {'http://': proxy, 'https://': proxy}
500510
proxy_works = self._check_proxy(proxies)
501511
if proxy_works:
502512
dirty_proxy = (yield proxy)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='scholarly',
8-
version='1.7.6',
8+
version='1.7.7',
99
author='Steven A. Cholewiak, Panos Ipeirotis, Victor Silva, Arun Kannawadi',
1010
author_email='steven@cholewiak.com, panos@stern.nyu.edu, vsilva@ualberta.ca, arunkannawadi@astro.princeton.edu',
1111
description='Simple access to Google Scholar authors and citations',

0 commit comments

Comments (0)