一个多月前
工作需要
用GPT手搓了
采集插件
仅支持“标题+链接+单页采集”
满足场景单一
今天换了国产模型豆包
试了几轮
终于能
一键采集全部历史发文
采集数据范围包括
标题、链接、发文时间、阅读数、点赞数、分享数、推荐数
安装油猴(Tampermonkey)扩展并添加以下脚本代码即可实现
油猴安装图文教程(教程)
// ==UserScript==
// @name 微信公众号跨页采集工具
// @namespace http://tampermonkey.net/
// @version 1.5.0
// @description 通过localStorage和URL参数实现跨页面采集
// @author doubao
// @match https://mp.weixin.qq.com/cgi-bin/appmsgpublish?sub=list*
// @grant GM_addStyle
// @grant GM_download
// @require https://cdn.jsdelivr.net/npm/xlsx@0.18.5/dist/xlsx.full.min.js
// @require https://cdn.jsdelivr.net/npm/file-saver@2.0.5/dist/FileSaver.min.js
// ==/UserScript==
(function() {
'use strict';
// Configuration
// Last page of the run: navigation stops and the export happens once this page is reached.
const TOTAL_PAGES = 11;
// Step of the `begin` query offset between consecutive pages
// (presumably the number of entries the list shows per page — confirm).
const ARTICLES_PER_PAGE = 10;
// localStorage key holding the rows collected so far, so they survive page navigation.
const STORAGE_KEY = 'wechat_articles_data';
// URL query parameter carrying the page number of the collection in progress.
const PARAM_KEY = 'collect_page';
// Inject styles for the fixed-position collect button and for the progress box
// shown below it (the progress box is hidden until a script sets `display`).
GM_addStyle(`
#wechat-data-collector {
position: fixed;
top: 20px;
right: 20px;
padding: 10px 20px;
background-color: #4CAF50;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
z-index: 9999;
}
#wechat-data-progress {
position: fixed;
top: 70px;
right: 20px;
background-color: #fff;
border: 1px solid #ddd;
padding: 8px 12px;
border-radius: 4px;
z-index: 9998;
display: none;
}
`);
/**
 * Reads the page number of the running collection from the URL query string.
 *
 * @returns {number|null} The positive integer stored under PARAM_KEY, or null
 *   when the parameter is absent or does not parse to a positive integer.
 */
function getCurrentPageFromUrl() {
  const rawPage = new URLSearchParams(window.location.search).get(PARAM_KEY);
  if (rawPage === null) {
    return null;
  }
  // Explicit radix. NaN, zero and negative numbers are not valid pages, so they
  // are reported the same way as "no collection in progress".
  const page = Number.parseInt(rawPage, 10);
  return Number.isInteger(page) && page > 0 ? page : null;
}
/**
 * Loads the article rows accumulated so far from localStorage.
 *
 * @returns {Array<Object>} The stored rows, or an empty array when nothing is
 *   stored or the stored value is not a JSON array.
 */
function loadSavedData() {
  const savedData = localStorage.getItem(STORAGE_KEY);
  if (!savedData) {
    return [];
  }
  try {
    const parsed = JSON.parse(savedData);
    // Callers concatenate onto the result, so anything but an array is unusable.
    return Array.isArray(parsed) ? parsed : [];
  } catch (error) {
    // Corrupted storage must not abort the run; report it and start empty.
    console.error('读取已保存的采集数据失败:', error);
    return [];
  }
}
/**
 * Persists the collected article rows in localStorage under STORAGE_KEY.
 *
 * @param {Array<Object>} data - Rows to store; serialized as JSON.
 */
function saveData(data) {
  const serialized = JSON.stringify(data);
  localStorage.setItem(STORAGE_KEY, serialized);
}
/**
 * Deletes the rows stored under STORAGE_KEY from localStorage.
 */
function clearSavedData() {
  const storage = localStorage;
  storage.removeItem(STORAGE_KEY);
}
/**
 * Scrapes every article card found on the current list page.
 *
 * For each `.weui-desktop-mass-appmsg__bd` element it reads the title and link
 * from the title element, derives the publish time from the `send_time`
 * value in the link (treated as Unix seconds), and reads the four counters.
 * A card whose parsing throws is logged and skipped.
 *
 * @returns {Array<Object>} One row per successfully parsed card, with the
 *   string-valued keys 标题, 链接, 发文时间, 阅读人数, 点赞人数, 分享人数, 推荐人数.
 */
function collectCurrentPage() {
  const COUNTER_SELECTOR = '.weui-desktop-mass-media__data__inner';
  // Text of one counter inside a card; '0' when the element or its text is missing.
  const readCounter = (card, wrapper) =>
    card.querySelector(`${wrapper} ${COUNTER_SELECTOR}`)?.textContent.trim() || '0';

  const cards = document.querySelectorAll('.weui-desktop-mass-appmsg__bd');
  console.log(`开始采集当前页,找到 ${cards.length} 篇文章`);

  const rows = [];
  for (const card of cards) {
    try {
      const anchor = card.querySelector('.weui-desktop-mass-appmsg__title');
      const link = anchor?.href || '';
      const title = anchor?.textContent.trim() || '无标题';

      // The link carries the publish timestamp in seconds.
      let publishTime = '未知时间';
      const sendTime = link.match(/send_time=(\d+)/);
      if (sendTime) {
        publishTime = new Date(parseInt(sendTime[1], 10) * 1000).toLocaleString();
      }

      rows.push({
        标题: title,
        链接: link,
        发文时间: publishTime,
        阅读人数: readCounter(card, '.appmsg-view'),
        点赞人数: readCounter(card, '.appmsg-like'),
        分享人数: readCounter(card, '.appmsg-share'),
        推荐人数: readCounter(card, '.appmsg-haokan'),
      });
    } catch (error) {
      console.error('解析文章数据时出错:', error);
    }
  }
  return rows;
}
/**
 * Sends the browser to the list page that follows `currentPage`.
 *
 * The current query string is kept; only PARAM_KEY (next page number) and
 * `begin` (offset of the next page) are overwritten.
 *
 * @param {number} currentPage - Page that has just been collected.
 * @returns {boolean} false when the next page would exceed TOTAL_PAGES (no
 *   navigation happens), true after the new URL has been assigned.
 */
function navigateToNextPage(currentPage) {
  const targetPage = currentPage + 1;
  if (targetPage > TOTAL_PAGES) {
    // Already on the last page.
    return false;
  }

  const { origin, pathname, search } = window.location;
  const query = new URLSearchParams(search);
  query.set(PARAM_KEY, targetPage);
  query.set('begin', (targetPage - 1) * ARTICLES_PER_PAGE);

  // Navigate, carrying the collection parameters along.
  const targetUrl = `${origin + pathname}?${query.toString()}`;
  window.location.href = targetUrl;
  return true;
}
/**
 * Writes the rows into a single-sheet XLSX workbook and triggers a download.
 *
 * Relies on the `XLSX` and `saveAs` globals provided by the script's
 * `@require` lines. The file name embeds the current ISO timestamp with
 * `:` and `T` replaced by `-`.
 *
 * @param {Array<Object>} data - Rows to export; passed to `json_to_sheet`.
 */
function exportToExcel(data) {
  const XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';

  const sheet = XLSX.utils.json_to_sheet(data);
  const workbook = XLSX.utils.book_new();
  XLSX.utils.book_append_sheet(workbook, sheet, '公众号数据');

  const bytes = XLSX.write(workbook, { bookType: 'xlsx', type: 'array' });
  const file = new Blob([bytes], { type: XLSX_MIME });

  const stamp = new Date().toISOString().replace(/[:T]/g, '-');
  saveAs(file, `公众号数据_${stamp}.xlsx`);
}
/**
 * Adds the start button, plus the hidden progress box if the document does not
 * contain one yet.
 *
 * Clicking the button discards previously saved rows and reloads the list at
 * page 1 (`begin=0`) with PARAM_KEY set, which starts a collection run.
 * Calling the function again when the button already exists does nothing.
 */
function createCollectorButton() {
  if (document.getElementById('wechat-data-collector')) {
    return;
  }

  const button = document.createElement('button');
  button.id = 'wechat-data-collector';
  button.textContent = '开始采集全部数据';
  document.body.appendChild(button);

  // Element ids must stay unique: another part of the script may already have
  // added the progress box, so only create it when it is missing.
  if (!document.getElementById('wechat-data-progress')) {
    const progress = document.createElement('div');
    progress.id = 'wechat-data-progress';
    document.body.appendChild(progress);
  }

  button.addEventListener('click', () => {
    // Drop rows left over from an earlier run.
    clearSavedData();
    // Restart from the first page, keeping the other query parameters.
    const { origin, pathname, search } = window.location;
    const params = new URLSearchParams(search);
    params.set(PARAM_KEY, 1);
    params.set('begin', 0);
    window.location.href = `${origin}${pathname}?${params.toString()}`;
  });
}
/**
 * Script entry point: continues a running collection or offers the start button.
 *
 * When the URL carries a page number under PARAM_KEY, the current page is
 * scraped, appended to the rows kept in localStorage, and the browser is sent
 * to the next page. On the last page the accumulated rows are exported and the
 * stored copy is cleared. Without that parameter only the start button is shown.
 */
function initCollector() {
  if (!window.location.href.includes('appmsgpublish?sub=list')) {
    return;
  }

  const currentPage = getCurrentPageFromUrl();
  if (!currentPage) {
    // Idle state: nothing to report, just offer the start button.
    createCollectorButton();
    return;
  }

  // The progress box is only created while a collection is running, so the
  // idle state never ends up with two elements sharing this id.
  const progress = document.createElement('div');
  progress.id = 'wechat-data-progress';
  document.body.appendChild(progress);
  progress.style.display = 'block';
  progress.textContent = `正在采集第 ${currentPage}/${TOTAL_PAGES} 页...`;

  // Stop when the page title reports a login timeout.
  if (document.title.includes('登录超时')) {
    progress.textContent = '登录超时,请重新扫码登录后再次点击采集按钮';
    return;
  }

  try {
    // Scrape this page and append it to what earlier pages stored.
    const articles = collectCurrentPage();
    const updatedData = loadSavedData().concat(articles);
    saveData(updatedData);
    console.log(`第 ${currentPage} 页采集完成,共 ${articles.length} 篇文章,累计 ${updatedData.length} 篇`);
    progress.textContent = `第 ${currentPage}/${TOTAL_PAGES} 页采集完成,共 ${articles.length} 篇`;

    if (currentPage < TOTAL_PAGES) {
      // Short pause so the progress text is visible before leaving the page.
      setTimeout(() => {
        navigateToNextPage(currentPage);
      }, 1500);
      return;
    }

    // Last page: export everything, then drop the stored copy.
    setTimeout(() => {
      // Timer callbacks run outside the surrounding try/catch, so export
      // failures need their own handler. The stored rows are kept on failure.
      try {
        exportToExcel(updatedData);
        progress.textContent = `全部 ${TOTAL_PAGES} 页采集完成,共 ${updatedData.length} 篇文章`;
        clearSavedData();
      } catch (error) {
        progress.textContent = `导出失败: ${error.message}`;
        console.error('导出过程中出错:', error);
      }
    }, 1000);
  } catch (error) {
    progress.textContent = `采集失败: ${error.message}`;
    console.error('采集过程中出错:', error);
  }
}

// The script may be injected after the `load` event has already fired, in which
// case a listener would never run; start right away when the page is complete.
if (document.readyState === 'complete') {
  initCollector();
} else {
  window.addEventListener('load', initCollector, { once: true });
}
})();