LinkedIn está sobrecargado y es extremadamente lento; acá podés usar estos 2 scripts para evitar ingresar a la página. Borré mis cookies, así que vas a tener que extraer las tuyas. El resultado es este: https://imlauer.github.io/george_hotz_linkedin.html
Cortesía de OpenCode.
linkedin.js te genera el JSON:
const puppeteer = require('puppeteer');
const fs = require('fs');
// Cookies installed on the page (via page.setCookie in main) before any URL
// is loaded. The values are blank here: fill in `value` with the cookies taken
// from your own logged-in LinkedIn session.
const cookies = [
  {
    domain: '.www.linkedin.com',
    name: 'li_at',
    value: '',
    path: '/',
    secure: true,
    httpOnly: false,
  },
  {
    domain: '.www.linkedin.com',
    name: 'li_rm',
    value: '',
    path: '/',
    secure: true,
    httpOnly: false,
  },
];
// Activity pages to scrape, visited in this order. Both belong to the same
// profile; identical post texts found on more than one page are de-duplicated
// in main before saving.
const urls = [
  'https://www.linkedin.com/in/george-hotz-b3866476/recent-activity/posts/',
  'https://www.linkedin.com/in/george-hotz-b3866476/recent-activity/',
];
/**
 * Loads a page, scrolls to the bottom repeatedly until the number of post
 * elements stops growing, and returns the text of every post element found.
 *
 * After each scroll that yields no new posts, every button whose text contains
 * "Show" or "See more" is clicked (best effort) before trying again.
 *
 * @param {object} page - Puppeteer page; only `goto` and `evaluate` are used.
 * @param {string} url - Address of the page to load.
 * @param {object} [options] - Optional tuning; defaults match the original
 *   hard-coded values.
 * @param {string} [options.postSelector] - CSS selector matching post elements.
 * @param {number} [options.navigationTimeoutMs=120000] - Timeout for `goto`.
 * @param {number} [options.initialWaitMs=15000] - Pause after navigation.
 * @param {number} [options.scrollWaitMs=2500] - Pause after each scroll.
 * @param {number} [options.expandWaitMs=1500] - Pause after clicking buttons.
 * @param {number} [options.maxIdleScrolls=15] - Stop after this many
 *   consecutive scrolls without new posts.
 * @param {number} [options.maxScrolls=150] - Hard cap on the number of scrolls.
 * @param {number} [options.minTextLength=100] - Texts must be longer than this.
 * @param {number} [options.maxTextLength=20000] - Texts must be shorter than this.
 * @returns {Promise<string[]>} Trimmed text of each matching element, in
 *   document order.
 */
async function scrapePage(page, url, options = {}) {
  const {
    postSelector = 'article, .feed-shared-update-v2',
    navigationTimeoutMs = 120000,
    initialWaitMs = 15000,
    scrollWaitMs = 2500,
    expandWaitMs = 1500,
    maxIdleScrolls = 15,
    maxScrolls = 150,
    minTextLength = 100,
    maxTextLength = 20000,
  } = options;

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log(`\n=== Loading ${url} ===`);
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: navigationTimeoutMs });
  await sleep(initialWaitMs);

  let scrollCount = 0;
  let lastPostCount = 0;
  let idleScrolls = 0;

  while (idleScrolls < maxIdleScrolls && scrollCount < maxScrolls) {
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(scrollWaitMs);

    // Functions given to page.evaluate run in the browser and cannot close
    // over local variables, so the selector is passed as an argument.
    const postCount = await page.evaluate(
      (selector) => document.querySelectorAll(selector).length,
      postSelector,
    );
    console.log(`Scroll ${scrollCount} - Posts: ${postCount}`);

    if (postCount > lastPostCount) {
      idleScrolls = 0;
      lastPostCount = postCount;
    } else {
      idleScrolls++;
      await page.evaluate(() => {
        for (const btn of Array.from(document.querySelectorAll('button'))) {
          const label = btn.innerText || btn.textContent || '';
          if (label.includes('Show') || label.includes('See more')) {
            try {
              btn.click();
            } catch (e) {
              // Deliberately best effort: a button that cannot be clicked
              // must not abort the scrape.
            }
          }
        }
      });
      await sleep(expandWaitMs);
    }
    scrollCount++;
  }

  return page.evaluate(
    (selector, minLength, maxLength) =>
      Array.from(document.querySelectorAll(selector))
        .map((element) => (element.innerText || element.textContent || '').trim())
        .filter((text) => text.length > minLength && text.length < maxLength),
    postSelector,
    minTextLength,
    maxTextLength,
  );
}
/**
 * Entry point: launches headless Chrome, installs the session cookies,
 * scrapes every address in `urls`, de-duplicates the collected post texts,
 * writes them to all_posts_complete.json and prints a preview of each one.
 *
 * A failure while scraping one URL is logged and the remaining URLs are still
 * processed. The browser is always closed, even when a later step throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled']
  });

  // try/finally so a failure below cannot leave the Chrome process running.
  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
    // Make navigator.webdriver read as undefined in every document the page loads.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    });
    await page.setCookie(...cookies);

    const allPosts = [];
    for (const url of urls) {
      try {
        const posts = await scrapePage(page, url);
        console.log(`Found ${posts.length} posts from this page`);
        allPosts.push(...posts);
      } catch (e) {
        // One page failing should not discard what the others returned.
        console.log(`Error loading ${url}: ${e.message}`);
      }
    }

    // The same post can appear on more than one page; keep each text once.
    const uniquePosts = [...new Set(allPosts)];
    console.log(`\n=== Total unique posts: ${uniquePosts.length} ===\n`);
    fs.writeFileSync('/home/esotericwarfare/opencode/all_posts_complete.json', JSON.stringify(uniquePosts, null, 2));
    console.log('Saved to all_posts_complete.json');

    console.log('\n--- All posts ---\n');
    uniquePosts.forEach((p, i) => {
      console.log(`=== POST ${i+1} ===`);
      console.log(p.substring(0, 600));
      console.log('\n---\n');
    });
  } finally {
    await browser.close();
  }
}
// Run the scraper. On an uncaught failure, report it and make the process
// exit with a non-zero status so callers can detect that the run failed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
generate_html.py genera el HTML a partir del JSON:
import json
# Load the list of scraped post texts. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('/home/esotericwarfare/opencode/all_posts_complete.json', 'r', encoding='utf-8') as f:
    posts = json.load(f)
# Static head of the output page: document skeleton, inline CSS, and a
# hard-coded profile header. The string ends with the opening
# <div id="posts"> tag; the rest of the script appends to `html` with `+=`.
html = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>George Hotz - LinkedIn Posts</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #f3f2ef; color: #191919; padding: 20px; }
.container { max-width: 700px; margin: 0 auto; }
header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
header h1 { font-size: 24px; color: #0a66c2; margin-bottom: 5px; }
header .title { font-size: 14px; color: #666; }
header .followers { font-size: 12px; color: #666; margin-top: 8px; }
.post { background: white; border-radius: 8px; padding: 16px; margin-bottom: 12px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
.post-header { display: flex; align-items: center; margin-bottom: 12px; }
.post-avatar { width: 48px; height: 48px; border-radius: 50%; background: linear-gradient(135deg, #667eea, #764ba2); display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 18px; margin-right: 12px; }
.post-info { flex: 1; }
.post-name { font-weight: 600; font-size: 14px; }
.post-role { font-size: 12px; color: #666; }
.post-time { font-size: 12px; color: #666; }
.post-content { font-size: 14px; white-space: pre-wrap; word-wrap: break-word; }
.post-stats { display: flex; gap: 15px; margin-top: 12px; padding-top: 12px; border-top: 1px solid #eee; font-size: 12px; color: #666; }
.visibility { font-size: 12px; color: #666; margin-top: 8px; }
.repost-label { background: #eef3f8; color: #0a66c2; padding: 12px; border-radius: 4px; margin-bottom: 12px; }
.image-placeholder { background: #f3f2ef; border-radius: 8px; padding: 40px; text-align: center; color: #666; margin: 12px 0; }
.footer { text-align: center; padding: 40px; color: #666; }
</style>
</head>
<body>
<div class="container">
<header>
<h1>George Hotz</h1>
<div class="title">President at comma.ai (we're hiring, comma.ai/jobs)</div>
<div class="title">San Diego, California, United States</div>
<div class="followers">56,325 followers</div>
</header>
<div id="posts">'''
def parse_post(text):
    """Split the raw text of one scraped post into display fields.

    Each line of ``text`` is classified with simple keyword heuristics:

    * a line containing the word "ago" together with a time unit
      (hour/day/week/month/year) becomes ``time`` (the last such line wins);
    * otherwise a line containing "Visible to" becomes ``visibility``;
    * otherwise the line is kept as content unless it is blank, contains the
      word "ago", contains one of the markers "Feed post", "Following" or
      "President at", starts with a bullet, or consists only of digits and
      dots.

    "ago" is matched as a whole word, so lines that merely contain it inside
    another word (e.g. "Chicago") are treated as ordinary content.

    Args:
        text: Text of one post, lines separated by ``\\n``.

    Returns:
        dict with keys ``content``, ``time``, ``visibility`` (strings) and
        ``is_repost``, ``has_image``, ``edited`` (booleans derived from marker
        phrases anywhere in ``text``).
    """
    import re  # local import: the script's import header only brings in json

    post = {
        'content': '',
        'time': '',
        'visibility': '',
        'is_repost': 'reposted this' in text,
        'has_image': 'Activate to view larger image' in text,
        'edited': 'Edited' in text,
    }
    time_units = ('hour', 'day', 'week', 'month', 'year')
    skip_markers = ('Feed post', 'Following', 'President at')

    content_lines = []
    for line in text.split('\n'):
        has_ago = re.search(r'\bago\b', line) is not None
        if has_ago and any(unit in line for unit in time_units):
            post['time'] = line.strip()
        elif 'Visible to' in line:
            post['visibility'] = line.strip()
        elif has_ago or not line.strip() or line.startswith('•'):
            continue
        elif any(marker in line for marker in skip_markers):
            continue
        elif not line.replace('.', '').isdigit():
            # Purely numeric lines are skipped; everything else is content.
            content_lines.append(line)

    post['content'] = '\n'.join(content_lines).strip()
    return post
# Append one card per scraped post to the page being built in `html`.
# Scraped text is untrusted, so every piece of it is HTML-escaped before it is
# inserted. Only `escape` is imported: the name `html` is already used by this
# script for the page string.
from html import escape as escape_html

for post_text in posts:
    p = parse_post(post_text)
    time_display = escape_html(p['time'], quote=False) if p['time'] else 'some time ago'
    edited_display = ' · Edited' if p['edited'] else ''
    html += f'''
<div class="post">
<div class="post-header">
<div class="post-avatar">GH</div>
<div class="post-info">
<div class="post-name">George Hotz</div>
<div class="post-role">President at comma.ai</div>
<div class="post-time">{time_display}{edited_display}</div>
</div>
</div>'''
    if p['is_repost']:
        html += '<div class="repost-label"><strong>George Hotz reposted this</strong></div>'
    content = escape_html(p['content'], quote=False)
    html += f'<div class="post-content">{content}</div>'
    if p['has_image']:
        html += '<div class="image-placeholder">🖼️ Image</div>'
    if p['visibility']:
        visibility_display = escape_html(p['visibility'], quote=False)
    else:
        visibility_display = 'Visible to anyone on or off LinkedIn'
    html += f'''<div class="visibility">{visibility_display}</div>
</div>'''
# Close the posts container, add the footer, and finish the document.
html += '''</div>
<div class="footer">
<p>Scraped from LinkedIn using cookies from Falkon browser</p>
<p>Total: ''' + str(len(posts)) + ''' posts</p>
</div>
</div>
</body>
</html>'''
# The page declares charset UTF-8 and can contain non-ASCII characters, so it
# is written as UTF-8 explicitly rather than in the platform default encoding.
with open('/home/esotericwarfare/opencode/george_hotz_linkedin.html', 'w', encoding='utf-8') as f:
    f.write(html)
print(f'HTML created with {len(posts)} posts')