→Code: v4.1.5 fix for the issue where self._page becomes null in _get_tweet_loop

Previous revision: Fet-Fe (→Code)
Current revision: Fet-Fe (→Code: v4.1.5 fix for the issue where self._page becomes null in _get_tweet_loop)

Legend: unmarked lines are unchanged context, lines prefixed with "-" come from the old revision, lines prefixed with "+" come from the new revision.
Line 11 → Line 11:
  """Twitter自動収集スクリプト
- ver4.1.
+ ver4.1.5 2023/11/4恒心
  当コードは恒心停止してしまった https://rentry.co/7298g の降臨ショーツイート自動収集スクリプトの復刻改善版です
Line 157 → Line 157:
  HEADERS: Final[dict[str, str]] = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'
+ 'User-Agent':
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'
  }
  """Final[dict[str, str]]: HTTPリクエスト時のヘッダ。
Line 180 → Line 181:
  """
- def __init__(self):
+ def __init__(self) -> None:
  """コンストラクタ。
  """
Line 214 → Line 215:
  try:
  return self._execute(url).text
- except requests.exceptions.
+ except requests.exceptions.HTTPError as e:
  raise AccessError from e
Line 231 → Line 232:
  try:
  res: Final[requests.models.Response] = self._execute(url)
- except requests.exceptions.
+ except requests.exceptions.HTTPError as e:
  raise AccessError from e
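Both hunks above switch the caught exception to requests.exceptions.HTTPError and re-raise it as the script's own AccessError. A minimal sketch of that wrapping pattern, assuming a plain requests.get call (fetch_text and the timeout value are illustrative, not the script's actual helper):

import requests


class AccessError(Exception):
    """Raised when an HTTP request ultimately fails."""


def fetch_text(url: str) -> str:
    try:
        res = requests.get(url, timeout=30)
        res.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx
        return res.text
    except requests.exceptions.HTTPError as e:
        # 'raise ... from e' keeps the original HTTPError attached as __cause__
        raise AccessError(f'failed to fetch {url}') from e

The 'from e' part is what the context line 'raise AccessError from e' relies on: the traceback still shows the underlying HTTP status failure.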
Line 268 → Line 269:
  is_tor = json.loads(res)['IsTor']
  if is_tor:
- logger.info('Tor connection OK')
+ logger.info('Tor browser connection OK')
  return self.PROXIES_WITH_BROWSER
  except requests.exceptions.ConnectionError:
Line 302 → Line 303:
  """
- TOR_BROWSER_PATHS: MappingProxyType[str, str] = MappingProxyType({
+ TOR_BROWSER_PATHS: Final[MappingProxyType[str, str]] = MappingProxyType({
  'Windows': r'C:\Program Files\Tor Browser\Browser\firefox.exe',
  'Darwin': '/Applications/Tor Browser.app/Contents/MacOS/firefox',
  'Linux': '/usr/bin/torbrowser'
  })
- """MappingProxyType[str, str]: OSごとのTor Browserのパス。
+ """Final[MappingProxyType[str, str]]: OSごとのTor Browserのパス。
  """
Line 318 → Line 319:
  """
- def __init__(self, enable_javascript: bool):
+ def __init__(self, enable_javascript: bool) -> None:
  """コンストラクタ。
Line 361 → Line 362:
  self._driver,
  self.WAIT_TIME_FOR_INIT)
  wait_init.until(
  ec.element_to_be_clickable((By.ID, 'connectButton'))
  )
  self._driver.find_element(By.ID, 'connectButton').click()
  # Torの接続が完了するまで待つ
  wait_init.until(ec.url_contains('about:blank'))
  except BaseException:
  self.quit()
Line 398 → Line 399:
  self._driver,
  self.WAIT_TIME_FOR_RECAPTCHA
  ).until(
  ec.visibility_of_element_located(
  # bot検知された場合に現れるクラス
Line 446 → Line 447:
  """
- def __init__(self, use_browser: bool, enable_javascript: bool):
+ def __init__(self, use_browser: bool, enable_javascript: bool) -> None:
  """コンストラクタ。
Line 607 → Line 608:
  """
- def __init__(self, date: datetime):
+ def __init__(self, date: datetime) -> None:
  """コンストラクタ。
Line 613 → Line 614:
  date (datetime): 記録するツイートの最新日付。
  """
- self._tables: list[str] = ['']
+ self._tables: Final[list[str]] = ['']
  self._count: int = 0 # 記録数
  self._date: datetime = date
Line 748 → Line 749:
  lambda t: head_space_pattern.sub(' ', t),
  lambda t: head_marks_pattern.sub(r'<nowiki>\1</nowiki>', t),
- lambda t: bar_pattern.sub('<nowiki>----</nowiki>', t)
+ lambda t: bar_pattern.sub('<nowiki>----</nowiki>', t),
+ lambda t: escape_nolink_urls(t),
  )
+ escaped_text: str = text
  for escape_callable in cls._escape_callables:
+ escaped_text = escape_callable(escaped_text)
+ return escaped_text
- return
  @staticmethod
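The hunk above stops reassigning the text parameter and instead threads a local escaped_text through every callable in _escape_callables, whose tuple now also ends with escape_nolink_urls. A reduced sketch of that fold-style pattern, with dummy escape functions standing in for the script's regex-based ones:

import re
from typing import Callable, Final

# Dummy stand-ins for the script's escape helpers.
_escape_callables: Final[tuple[Callable[[str], str], ...]] = (
    lambda t: re.sub(r'^----$', '<nowiki>----</nowiki>', t, flags=re.MULTILINE),
    lambda t: t.replace('|', '&#124;'),
)


def escape_wiki_text(text: str) -> str:
    escaped_text: str = text  # leave the caller's argument untouched
    for escape_callable in _escape_callables:
        escaped_text = escape_callable(escaped_text)
    return escaped_text


print(escape_wiki_text('a|b\n----'))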
Line 857 → Line 859:
  """
- INVIDIOUS_INSTANCES_TUPLE: tuple[str, ...] = (
+ INVIDIOUS_INSTANCES_TUPLE: Final[tuple[str, ...]] = (
  'piped.kavin.rocks',
  'piped.video'
  )
- """tuple[str, ...]: よく使われるInvidiousインスタンスのリスト。
+ """Final[tuple[str, ...]]: よく使われるInvidiousインスタンスのリスト。
  :const:`~INVIDIOUS_INSTANCES_URL` にアクセスしてもインスタンスが取得できないことがあるため、
Line 905 → Line 907:
  """
- def __init__(self):
+ def __init__(self) -> None:
  """コンストラクタ。
  """
Line 913 → Line 915:
  self._img_ext_pattern: Final[Pattern[str]] = re.compile(
  r'%2F([^%]*\.(?:jpg|jpeg|png|gif))')
- self._url_fragment_pattern: Final[Pattern[str]] = re.compile('#[^#]*$')
+ self._url_fragment_pattern: Final[Pattern[str]] = re.compile(
+ r'#[^#]*$')
  self._url_query_pattern: Final[Pattern[str]] = re.compile(r'\?.*$')
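The hunk above only re-wraps the compile call for _url_fragment_pattern, but as a quick sanity check of what these patterns do: _img_ext_pattern picks a media file name out of a percent-encoded Nitter URL, and _url_fragment_pattern strips a trailing fragment. A short check with an invented URL (the URL is illustrative only):

import re
from typing import Final

img_ext_pattern: Final[re.Pattern[str]] = re.compile(
    r'%2F([^%]*\.(?:jpg|jpeg|png|gif))')
url_fragment_pattern: Final[re.Pattern[str]] = re.compile(r'#[^#]*$')

sample: Final[str] = 'https://nitter.example/pic/media%2FFxyz123.jpg#m'
no_fragment: str = url_fragment_pattern.sub('', sample)  # drops '#m'
matched = img_ext_pattern.search(no_fragment)
assert matched is not None
print(matched.group(1))  # -> Fxyz123.jpg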
Line 969 → Line 972:
  # 日付取得
- timeline_item: Tag | NavigableString | None = BeautifulSoup(
+ timeline_item: Final[Tag | NavigableString | None] = BeautifulSoup(
  self._page, 'html.parser').find(
  class_='timeline-item')
  assert isinstance(timeline_item, Tag)
- date: datetime = self._tweet_date(timeline_item)
+ date: Final[datetime] = self._tweet_date(timeline_item)
  self._table_builder: TableBuilder = TableBuilder(date)
Line 1,066 → Line 1,069:
  logger.critical('Invidiousが死んでますを')
  sys.exit(1)
- instance_list: list[str] = []
+ instance_list: Final[list[str]] = []
  for instance_info in json.loads(invidious_json):
  instance_list.append(instance_info[0])
Line 1,225 → Line 1,228:
  datetime: ツイートの時刻。
  """
- tweet_date: Tag | NavigableString | None = tweet.find(
+ tweet_date: Final[Tag | NavigableString | None] = tweet.find(
  class_='tweet-date')
  assert isinstance(tweet_date, Tag)
- tweet_date_a: Tag | None = tweet_date.a
+ tweet_date_a: Final[Tag | None] = tweet_date.a
  assert tweet_date_a is not None
- date_str: str | list[str] | None = tweet_date_a.get('title')
+ date_str: Final[str | list[str] | None] = tweet_date_a.get('title')
  assert isinstance(date_str, str)
  return datetime.strptime(
Line 1,258 → Line 1,261:
  media_txt: str = ''
  if tweet_media is not None:
- media_list: list[str] = []
+ media_list: Final[list[str]] = []
  # ツイートの画像の取得
  for image_a in tweet_media.select('.attachment.image a'):
  try:
- href: str | list[str] | None = image_a.get('href')
+ href: Final[str | list[str] | None] = image_a.get('href')
  assert isinstance(href, str)
+ img_matched: Final[Match[str] | None] = (
+ self._img_ext_pattern.search(href))
- assert
+ assert img_matched is not None
- media_name: Final[str] =
+ media_name: Final[str] = img_matched.group(1)
  media_list.append(f'[[ファイル:{media_name}|240px]]')
  if self._download_media(
Line 1,301 → Line 1,304:
  data_url: Final[str | list[str] | None] = video.get('data-url')
  assert isinstance(data_url, str)
- assert
- media_path: Final[str] = unquote(
+ video_matched: Final[Match[str] | None] = re.search(
+ r'[^/]+$', data_url)
+ assert video_matched is not None
+ media_path: Final[str] = unquote(video_matched.group())
  tweet_id: Final[str] = tweet_url.split('/')[-1]
  ts_filename: Final[str] = (
Line 1,349 → Line 1,353:
  quote_txt: str = ''
  if tweet_quote is not None:
- quote_link: Tag | None = tweet_quote.select_one('.quote-link')
+ quote_link: Final[Tag | None] = (
+ tweet_quote.select_one('.quote-link'))
  assert quote_link is not None
+ link_href: Final[str | list[str] | None] = quote_link.get('href')
- assert isinstance(
+ assert isinstance(link_href, str)
- link = self._url_fragment_pattern.sub('',
+ link: str = self._url_fragment_pattern.sub('', link_href)
  link = urljoin(self.TWITTER_URL, link)
  quote_txt = self._archive_url(link, accessor)
Line 1,376 → Line 1,381:
  poll_meters: Final[ResultSet[Tag]] = tweet_poll.select(
  '.poll-meter')
- poll_info: Tag | None = tweet_poll.select_one('.poll-info')
+ poll_info: Final[Tag | None] = tweet_poll.select_one('.poll-info')
  assert poll_info is not None
  for poll_meter in poll_meters:
- poll_choice_value: Tag | None = poll_meter.select_one(
+ poll_choice_value: Final[Tag | None] = poll_meter.select_one(
  '.poll-choice-value')
  assert poll_choice_value is not None
- ratio: str = poll_choice_value.text
+ ratio: Final[str] = poll_choice_value.text
- poll_choice_option: Tag | None = poll_meter.select_one(
+ poll_choice_option: Final[Tag | None] = poll_meter.select_one(
  '.poll-choice-option')
  assert poll_choice_option is not None
Line 1,420 → Line 1,425:
  list[Tag]: ツイートのアイテムである ``.timeline-item`` タグを表すTagオブジェクトのリスト。
  """
- timeline_item_list: list[Tag] = []
+ timeline_item_list: Final[list[Tag]] = []
  for item_or_list in soup.select(
  '.timeline > .timeline-item, .timeline > .thread-line'):
Line 1,447 → Line 1,452:
  urls_in_tweet: Final[ResultSet[Tag]] = tag.find_all('a')
  for url in urls_in_tweet:
- href: str | list[str] | None = url.get('href')
+ href: Final[str | list[str] | None] = url.get('href')
  assert isinstance(href, str)
Line 1,467 → Line 1,472:
  and self._invidious_pattern.search(href)):
  # Nitter上のYouTubeへのリンクをInvidiousのものから直す
- if re.match(
- url.replace_with(self._archive_url(
+ invidious_href: Final[str | list[str] | None] = (
+ self._invidious_pattern.sub(
+ 'youtube.com' if (
+ re.match(r'https://[^/]+/[^/]+/', href)
+ or re.search(r'/@[^/]*$', href)
+ ) else 'youtu.be',
+ href))
+ url.replace_with(self._archive_url(
+ invidious_href, accessor))
  elif href.startswith('https://bibliogram.art/'):
  # Nitter上のInstagramへのリンクをBibliogramのものから直す
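The rewritten block above builds invidious_href in a single expression: the Invidious host in href is replaced with youtube.com when the path looks like a channel page (a second path segment or an /@handle suffix), and with youtu.be otherwise. A simplified sketch of that decision; the real _invidious_pattern is defined elsewhere in the script, so the host pattern below is a guess based on the instance list shown earlier:

import re
from typing import Final

# Hypothetical stand-in for the script's _invidious_pattern.
invidious_pattern: Final[re.Pattern[str]] = re.compile(
    r'piped\.kavin\.rocks|piped\.video')


def to_youtube(href: str) -> str:
    host: str = 'youtube.com' if (
        re.match(r'https://[^/]+/[^/]+/', href)  # a second path segment follows
        or re.search(r'/@[^/]*$', href)          # an /@handle channel link
    ) else 'youtu.be'
    return invidious_pattern.sub(host, href)


print(to_youtube('https://piped.video/@somechannel'))  # -> https://youtube.com/@somechannel
print(to_youtube('https://piped.video/dQw4w9WgXcQ'))   # -> https://youtu.be/dQw4w9WgXcQ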
Line 1,487 → Line 1,492:
  elif url.text.startswith('@'):
  url_link: str = urljoin(self.TWITTER_URL, href)
- url_text: str = url.text
+ url_text: Final[str] = url.text
  url.replace_with(
  self._archive_url(
Line 1,554 → Line 1,559:
  else:
  soup: Final[BeautifulSoup] = BeautifulSoup(res, 'html.parser')
- content: Tag | NavigableString | None = soup.find(
+ content: Final[Tag | NavigableString | None] = soup.find(
  id='CONTENT') # archive.todayの魚拓一覧ページの中身だけ取得
  if (content is None or content.get_text()[:len(self.NO_ARCHIVE)]
Line 1,561 → Line 1,566:
  else:
  assert isinstance(content, Tag)
- content_a: Tag | NavigableString | None = content.find('a')
+ content_a: Final[Tag | NavigableString | None] = content.find(
+ 'a')
  assert isinstance(content_a, Tag)
- href: str | list[str] | None = content_a.get('href')
+ href: Final[str | list[str] | None] = content_a.get('href')
  assert isinstance(href, str)
  archive_url = href.replace(
Line 1,585 → Line 1,591:
  tweets: Final[list[Tag]] = self._get_timeline_items(soup)
  for tweet in tweets:
- tweet_a: Tag | None = tweet.a
+ tweet_a: Final[Tag | None] = tweet.a
  assert tweet_a is not None
  if tweet_a.text == self.NEWEST:
Line 1,610 → Line 1,616:
  self._table_builder.next_day_if_necessary(date)
- tweet_link: Tag | NavigableString | None = tweet.find(
+ tweet_link: Final[Tag | NavigableString | None] = tweet.find(
  class_='tweet-link')
  assert isinstance(tweet_link, Tag)
- href: str | list[str] | None = tweet_link.get('href')
+ href: Final[str | list[str] | None] = tweet_link.get('href')
  assert isinstance(href, str)
  tweet_url: Final[str] = urljoin(
Line 1,621 → Line 1,627:
  tweet_callinshow_template: Final[str] = self._callinshowlink_url(
  tweet_url, accessor)
- tweet_content: Tag | NavigableString | None = tweet.find(
+ tweet_content: Final[Tag | NavigableString | None] = tweet.find(
  class_='tweet-content media-body')
  assert isinstance(tweet_content, Tag)
Line 1,670 → Line 1,676:
  for show_more in show_mores: # show-moreに次ページへのリンクか前ページへのリンクがある
  if show_more.text != self.NEWEST: # 前ページへのリンクではないか判定
- show_more_a: Tag | None = show_more.a
+ show_more_a: Final[Tag | None] = show_more.a
  assert show_more_a is not None
- href: str | list[str] | None = show_more_a.get('href')
+ href: Final[str | list[str] | None] = show_more_a.get('href')
  assert isinstance(href, str)
  new_url = urljoin(
Line 1,747 → Line 1,753:
  """
- TWEET_URL_PREFIX_DEFAULT: Final[str] = '
+ TWEET_URL_PREFIX_DEFAULT: Final[str] = '17207'
  """Final[str]: ツイートURLの数字部分のうち、予め固定しておく部分。
Line 1,754 → Line 1,760:
  """
- INCREMENTED_NUM_DEFAULT: Final[int] =
+ INCREMENTED_NUM_DEFAULT: Final[int] = 4
  """Final[int]: ツイートURLの数字部分うち、インクリメントする桁のデフォルト値。
Line 1,817 → Line 1,823:
  str: タグの属性値。
  """
- result: str | list[str] | None = tag.get(key)
+ result: Final[str | list[str] | None] = tag.get(key)
  assert isinstance(result, str)
  return result
Line 1,853 → Line 1,859:
  '#CONTENT > div > .TEXT-BLOCK')
  for tweet in tweets:
- a_last_child: Tag | None = tweet.select_one('a:last-child')
+ a_last_child: Final[Tag | None] = tweet.select_one('a:last-child')
  assert a_last_child is not None
  url_matched: Final[Match[str]] | None = (
Line 1,859 → Line 1,865:
  )
  if url_matched is not None:
- a_first_child: Tag | None = tweet.select_one('a:first-child')
+ a_first_child: Final[Tag | None] = tweet.select_one(
+ 'a:first-child')
  assert a_first_child is not None
- archive_url: str | list[str] | None = a_first_child.get('href')
+ archive_url: Final[str | list[str] | None] = a_first_child.get(
+ 'href')
  assert isinstance(archive_url, str)
  if url_matched[0] not in self._url_list_on_wiki:
Line 1,869 → Line 1,877:
  self._url_list.sort(reverse=True, key=lambda x: x.url) # 降順
- def
+ def _fetch_next_page(
  self,
  soup: BeautifulSoup,
- accessor: AccessorHandler) ->
+ accessor: AccessorHandler) -> str | None:
  """archive.todayの検索結果のページをpaginateする。
Line 1,880 → Line 1,888:
  Returns:
+ str | None: 次のページがあればそのHTML。
  """
- next_a: Tag | None = soup.select_one('#next')
+ next_a: Final[Tag | None] = soup.select_one('#next')
  if next_a is not None:
- link: str | list[str] | None = next_a.get('href')
+ link: Final[str | list[str] | None] = next_a.get('href')
  assert isinstance(link, str)
  page: Final[str | None] = accessor.request(link)
  assert page is not None
+ return page
  else:
  return
  def _get_tweet_loop(
Line 1,906 → Line 1,913:
  while has_next:
  self._append_tweet_urls(soup)
- soup = BeautifulSoup(
+ next_page: Final[str | None] = self._fetch_next_page(
+ soup, accessor)
+ if next_page is not None:
+ soup = BeautifulSoup(next_page, 'html.parser')
+ else:
+ has_next = False
  def _next_url(
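These two hunks are the fix named in the revision title: the pagination helper, now called _fetch_next_page, returns the next page's HTML as str | None instead of updating state, and _get_tweet_loop only rebuilds soup when a page actually came back, ending the loop otherwise (the old lines, which apparently went through a self._page attribute, are only partially preserved above). A self-contained sketch of the same control flow, with an in-memory page table standing in for AccessorHandler and archive.today:

from bs4 import BeautifulSoup

# Dummy 'site': page HTML keyed by URL.
PAGES: dict[str, str] = {
    '/page1': '<div class="item">a</div><a id="next" href="/page2">next</a>',
    '/page2': '<div class="item">b</div>',  # last page: no #next link
}


def fetch_next_page(soup: BeautifulSoup) -> str | None:
    """Return the next page's HTML if a #next link exists, else None."""
    next_a = soup.select_one('#next')
    if next_a is not None:
        link = next_a.get('href')
        assert isinstance(link, str)
        return PAGES[link]
    return None


def get_item_loop(first_page: str) -> list[str]:
    soup = BeautifulSoup(first_page, 'html.parser')
    items: list[str] = []
    has_next = True
    while has_next:
        items.extend(tag.text for tag in soup.select('.item'))
        next_page = fetch_next_page(soup)
        if next_page is not None:
            soup = BeautifulSoup(next_page, 'html.parser')
        else:
            has_next = False
    return items


print(get_item_loop(PAGES['/page1']))  # -> ['a', 'b']

Because the loop now decides termination from the returned value, there is no point at which a missing page can be dereferenced on the next iteration.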
Line 1,936 → Line 1,947:
  self._next_url(accessor, '16', 5)
  """
+ assert 0 <= incremented_num and incremented_num <= 9, \
+ f'incremented_numが{incremented_num}でふ'
  logger.info(self.TWITTER_URL + self._name + '/status/'
  + tweet_url_prefix + str(incremented_num) + '*を探索中')
  page: Final[str | None] = accessor.request(
  self.ARCHIVE_TODAY
Line 1,947 → Line 1,961:
  assert page is not None
  soup: Final[BeautifulSoup] = BeautifulSoup(page, 'html.parser')
- pager: Tag | None = soup.select_one('#pager')
+ pager: Final[Tag | None] = soup.select_one('#pager')
  if pager is not None: # 検索結果が複数ページ
  page_num_matched: Final[Match[str] | None] = re.search(
Line 1,981 → Line 1,996:
  datetime_tag.get('datetime'))
  assert isinstance(datetime_str, str)
- raw_time: datetime = datetime.strptime(
- datetime_str,
+ raw_time: Final[datetime] = datetime.strptime(
+ datetime_str, '%Y-%m-%dT%H:%M:%SZ')
  return raw_time.replace(tzinfo=ZoneInfo('Asia/Tokyo'))
Line 1,999 → Line 2,013:
  list[UrlTuple]: リツイートを除いたURLのリスト。
  """
- filtered_urls: list[UrlTuple] = []
+ filtered_urls: Final[list[UrlTuple]] = []
  for url_pair in url_pairs:
- page: str | None = accessor.request(url_pair.archive_url)
+ page: Final[str | None] = accessor.request(url_pair.archive_url)
  assert page is not None
- soup: BeautifulSoup = BeautifulSoup(page, 'html.parser')
+ soup: Final[BeautifulSoup] = BeautifulSoup(page, 'html.parser')
  if soup.select_one('span[data-testid="socialContext"]') is None:
  filtered_urls.append(url_pair)
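The filter above loads each archived tweet page and keeps it only when no span[data-testid="socialContext"] element is present, which the script uses as its retweet marker (per the '# リツイートを除く' comment in the following hunk). A minimal sketch of that check on canned HTML; both snippets are invented for illustration:

from bs4 import BeautifulSoup


def is_retweet(page_html: str) -> bool:
    """True if the archived tweet page carries a socialContext header."""
    soup = BeautifulSoup(page_html, 'html.parser')
    return soup.select_one('span[data-testid="socialContext"]') is not None


own_tweet = '<article><span>hello</span></article>'
retweet = '<article><span data-testid="socialContext">someone Retweeted</span></article>'
print(is_retweet(own_tweet), is_retweet(retweet))  # -> False True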
Line 2,030 → Line 2,044:
  # リツイートを除く
- filtered_url_list: list[UrlTuple] = self._filter_out_retweets(
+ filtered_url_list: Final[list[UrlTuple]] = (
+ self._filter_out_retweets(self._url_list, accessor))
  with codecs.open(self.FILENAME, 'w', 'utf-8') as f: