→コード: v4.3.7 ツイートの魚拓のhtmlの構造変更に対応
>Fet-Fe (→コード: v4.3.6 魚拓からの取得の仕組み変更) |
>Fet-Fe (→コード: v4.3.7 ツイートの魚拓のhtmlの構造変更に対応) |
||
11行目: | 11行目: | ||
"""Twitter自動収集スクリプト | """Twitter自動収集スクリプト | ||
ver4.3.6 | ver4.3.7 2024/6/8恒心 | ||
当コードは恒心停止してしまった https://rentry.co/7298g の降臨ショーツイート自動収集スクリプトの復刻改善版です。 | 当コードは恒心停止してしまった https://rentry.co/7298g の降臨ショーツイート自動収集スクリプトの復刻改善版です。 | ||
76行目: | 76行目: | ||
from datetime import datetime | from datetime import datetime | ||
from enum import Enum | from enum import Enum | ||
from time import sleep | from time import sleep | ||
from traceback import TracebackException | from traceback import TracebackException | ||
482行目: | 481行目: | ||
print('reCAPTCHAを解いてね(笑)、それはできるよね。\a\a\a') | print('reCAPTCHAを解いてね(笑)、それはできるよね。\a\a\a') | ||
print('botバレしたら自動でブラウザが再起動するナリよ') | print('botバレしたら自動でブラウザが再起動するナリよ') | ||
print('Tips: カーソルを迷ったように動かすとか、人間らしく振る舞うのがコツナリ') | |||
WebDriverWait(self._driver, self.WEB_DRIVER_WAIT_TIME).until( | WebDriverWait(self._driver, self.WEB_DRIVER_WAIT_TIME).until( | ||
ec.presence_of_element_located( | ec.presence_of_element_located( | ||
873行目: | 873行目: | ||
if not hasattr(cls, '_escape_callables'): | if not hasattr(cls, '_escape_callables'): | ||
# 初回呼び出しの時だけ正規表現をコンパイルする | # 初回呼び出しの時だけ正規表現をコンパイルする | ||
head_space_pattern: Final[Pattern[str]] = re.compile( | head_space_pattern: Final[re.Pattern[str]] = re.compile( | ||
r'^ ', re.MULTILINE) | r'^ ', re.MULTILINE) | ||
head_marks_pattern: Final[Pattern[str]] = re.compile( | head_marks_pattern: Final[re.Pattern[str]] = re.compile( | ||
r'^([\*#:;])', re.MULTILINE) | r'^([\*#:;])', re.MULTILINE) | ||
bar_pattern: Final[Pattern[str]] = re.compile( | bar_pattern: Final[re.Pattern[str]] = re.compile( | ||
r'^----', re.MULTILINE) | r'^----', re.MULTILINE) | ||
1,058行目: | 1,058行目: | ||
self._has_ffmpeg: Final[bool] = self._check_ffmpeg() # ffmpegがあるかチェック | self._has_ffmpeg: Final[bool] = self._check_ffmpeg() # ffmpegがあるかチェック | ||
self._img_ext_pattern: Final[Pattern[str]] = re.compile( | self._img_ext_pattern: Final[re.Pattern[str]] = re.compile( | ||
r'%2F([^%]*\.(?:jpg|jpeg|png|gif))') | r'%2F([^%]*\.(?:jpg|jpeg|png|gif))') | ||
self._url_fragment_pattern: Final[Pattern[str]] = re.compile( | self._url_fragment_pattern: Final[re.Pattern[str]] = re.compile( | ||
r'#[^#]*$') | r'#[^#]*$') | ||
self._url_query_pattern: Final[Pattern[str]] = re.compile(r'\?.*$') | self._url_query_pattern: Final[re.Pattern[str]] = re.compile(r'\?.*$') | ||
def _set_queries(self, accessor: AccessorHandler, krsw: bool) -> bool: | def _set_queries(self, accessor: AccessorHandler, krsw: bool) -> bool: | ||
1,419行目: | 1,419行目: | ||
href: Final[str | list[str] | None] = image_a.get('href') | href: Final[str | list[str] | None] = image_a.get('href') | ||
assert isinstance(href, str) | assert isinstance(href, str) | ||
img_matched: Final[Match[str] | None] = ( | img_matched: Final[re.Match[str] | None] = ( | ||
self._img_ext_pattern.search(href)) | self._img_ext_pattern.search(href)) | ||
assert img_matched is not None | assert img_matched is not None | ||
1,456行目: | 1,456行目: | ||
data_url: Final[str | list[str] | None] = video.get('data-url') | data_url: Final[str | list[str] | None] = video.get('data-url') | ||
assert isinstance(data_url, str) | assert isinstance(data_url, str) | ||
video_matched: Final[Match[str] | None] = re.search( | video_matched: Final[re.Match[str] | None] = re.search( | ||
r'[^/]+$', data_url) | r'[^/]+$', data_url) | ||
assert video_matched is not None | assert video_matched is not None | ||
1,711行目: | 1,711行目: | ||
assert isinstance(content, Tag) | assert isinstance(content, Tag) | ||
content_a: Final[Tag | NavigableString | None] = ( | content_a: Final[Tag | NavigableString | None] = ( | ||
content.select_one('.TEXT-BLOCK > a')) | content.select_one('.TEXT-BLOCK > a')) # 最新の魚拓を取得 | ||
assert isinstance(content_a, Tag) | assert isinstance(content_a, Tag) | ||
href: Final[str | list[str] | None] = content_a.get('href') | href: Final[str | list[str] | None] = content_a.get('href') | ||
1,890行目: | 1,890行目: | ||
self._invidious_instances(accessor) | self._invidious_instances(accessor) | ||
) | ) | ||
self._invidious_pattern: Pattern[str] = re.compile( | self._invidious_pattern: re.Pattern[str] = re.compile( | ||
'|'.join(invidious_url_tuple)) | '|'.join(invidious_url_tuple)) | ||
1,982行目: | 1,982行目: | ||
+ ', 最古のURL: ' + self._oldest_url + '*' + 'で検索しまふ' | + ', 最古のURL: ' + self._oldest_url + '*' + 'で検索しまふ' | ||
) | ) | ||
self._twitter_url_pattern: Pattern[str] = re.compile( | self._twitter_url_pattern: re.Pattern[str] = re.compile( | ||
'^' + self.TWITTER_URL + self._name + r'/status/\d+') | '^' + self.TWITTER_URL + self._name + r'/status/\d+') | ||
self._archive_rt_pattern: Pattern[str] = re.compile( | self._archive_rt_pattern: re.Pattern[str] = re.compile( | ||
r'on (?:Twitter|X): "RT @\w+:.+"(?:$| / Twitter$| / X$)') | r'on (?:Twitter|X): "RT @\w+:.+"(?:$| / Twitter$| / X$)') | ||
2,074行目: | 2,074行目: | ||
continue | continue | ||
url_matched: Final[Match[str]] | None = next( | url_matched: Final[re.Match[str]] | None = next( | ||
filter( | filter( | ||
lambda x: x is not None, | lambda x: x is not None, | ||
2,186行目: | 2,186行目: | ||
pager: Final[Tag | None] = soup.select_one('#pager') | pager: Final[Tag | None] = soup.select_one('#pager') | ||
if pager is not None: # 検索結果が複数ページ | if pager is not None: # 検索結果が複数ページ | ||
page_num_matched: Final[Match[str] | None] = re.search( | page_num_matched: Final[re.Match[str] | None] = re.search( | ||
r'of (\d+) urls', pager.text) | r'of (\d+) urls', pager.text) | ||
assert page_num_matched is not None | assert page_num_matched is not None | ||
2,290行目: | 2,290行目: | ||
# 通常のリンク | # 通常のリンク | ||
a_tags: Final[ResultSet[Tag]] = tag.select( | a_tags: Final[ResultSet[Tag]] = tag.select( | ||
'div[ | 'div[dir="auto"] > a:not(' | ||
'div[role="link"] div[ | 'div[role="link"] div[dir="auto"] > a)') | ||
for a_tag in a_tags: | for a_tag in a_tags: | ||
a_tag.replace_with( | a_tag.replace_with( | ||
2,443行目: | 2,443行目: | ||
account_name_tag: Final[Tag | None] = ( | account_name_tag: Final[Tag | None] = ( | ||
retweet_tag.select_one( | retweet_tag.select_one( | ||
'div > span:not(:has(> *))')) # noqa: E501 | 'div[tabindex="-1"] > div > span:not(:has(> *))')) # noqa: E501 | ||
assert account_name_tag is not None | assert account_name_tag is not None | ||
text = self._concat_texts( | text = self._concat_texts( |