LinkedIn scraper

LinkedIn está sobrecargado y es extremadamente lento; acá podés usar estos 2 scripts para evitar ingresar a la página. Borré mis cookies, así que vas a tener que extraer las tuyas (`li_at` y `li_rm`) desde tu navegador. El resultado es este: https://imlauer.github.io/george_hotz_linkedin.html

Cortesía de OpenCode.

`linkedin.js` genera el JSON:

const puppeteer = require('puppeteer');
const fs = require('fs');

// LinkedIn session cookies sent with every request. The values are secrets:
// supply them through the LINKEDIN_LI_AT / LINKEDIN_LI_RM environment
// variables (or paste them in place of the fallbacks) and never commit them.
// When a variable is unset the value stays empty, exactly as before.
const cookies = [
  { domain: '.www.linkedin.com', name: 'li_at', value: process.env.LINKEDIN_LI_AT ?? '', path: '/', secure: true, httpOnly: false },
  { domain: '.www.linkedin.com', name: 'li_rm', value: process.env.LINKEDIN_LI_RM ?? '', path: '/', secure: true, httpOnly: false },
];

// Activity feeds to scrape for the target profile: the "posts" tab first,
// then the general recent-activity feed.
const recentActivityUrl = 'https://www.linkedin.com/in/george-hotz-b3866476/recent-activity/';
const urls = [`${recentActivityUrl}posts/`, recentActivityUrl];

/**
 * Loads a page, keeps scrolling to the bottom until the number of post
 * elements stops growing, and returns the visible text of each post.
 *
 * Whenever a scroll adds no posts, every button whose label contains "Show"
 * or "See more" is clicked (best effort) to expand collapsed content.
 *
 * @param {object} page - Puppeteer page; only `goto` and `evaluate` are used.
 * @param {string} url - Address of the page to load.
 * @param {object} [options] - Optional tuning; the defaults are the values
 *   that used to be hard-coded.
 * @param {number} [options.navigationTimeoutMs=120000] - Timeout for `goto`.
 * @param {number} [options.initialDelayMs=15000] - Wait after navigation.
 * @param {number} [options.scrollDelayMs=2500] - Wait after each scroll.
 * @param {number} [options.expandDelayMs=1500] - Wait after clicking buttons.
 * @param {number} [options.maxIdleScrolls=15] - Consecutive scrolls without
 *   new posts before giving up.
 * @param {number} [options.maxScrolls=150] - Hard cap on scroll iterations.
 * @returns {Promise<string[]>} Trimmed text of each post element whose length
 *   is strictly between 100 and 20000 characters.
 */
async function scrapePage(page, url, options = {}) {
  const {
    navigationTimeoutMs = 120000,
    initialDelayMs = 15000,
    scrollDelayMs = 2500,
    expandDelayMs = 1500,
    maxIdleScrolls = 15,
    maxScrolls = 150,
  } = options;

  // Single source of truth for what counts as a post element. It is passed
  // into page.evaluate as an argument because those callbacks run inside the
  // page and cannot see variables from this scope.
  const postSelector = 'article, .feed-shared-update-v2';
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log(`\n=== Loading ${url} ===`);
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: navigationTimeoutMs });
  await sleep(initialDelayMs);

  let scrollCount = 0;
  let lastPostCount = 0;
  let idleScrolls = 0;

  while (idleScrolls < maxIdleScrolls && scrollCount < maxScrolls) {
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await sleep(scrollDelayMs);

    const postCount = await page.evaluate(
      (selector) => document.querySelectorAll(selector).length,
      postSelector,
    );

    console.log(`Scroll ${scrollCount} - Posts: ${postCount}`);

    if (postCount > lastPostCount) {
      idleScrolls = 0;
      lastPostCount = postCount;
    } else {
      idleScrolls++;

      // Clicking stays best effort, but failures are counted and reported
      // instead of being silently discarded.
      const failedClicks = await page.evaluate(() => {
        let failures = 0;
        for (const btn of document.querySelectorAll('button')) {
          const label = btn.innerText || btn.textContent || '';
          if (label.includes('Show') || label.includes('See more')) {
            try {
              btn.click();
            } catch (err) {
              failures++;
            }
          }
        }
        return failures;
      });
      if (failedClicks > 0) {
        console.log(`Could not click ${failedClicks} expand button(s)`);
      }
      await sleep(expandDelayMs);
    }
    scrollCount++;
  }

  const posts = await page.evaluate((selector) => {
    const results = [];
    document.querySelectorAll(selector).forEach((element) => {
      const text = (element.innerText || element.textContent || '').trim();
      // Skip tiny fragments and implausibly large containers.
      if (text.length > 100 && text.length < 20000) {
        results.push(text);
      }
    });
    return results;
  }, postSelector);

  return posts;
}

/**
 * Scrapes every address in `urls` with one headless browser, removes
 * duplicate post texts, writes the result as JSON, and prints a preview of
 * each post.
 *
 * A failure on one URL is logged and the remaining URLs are still processed.
 * The browser is closed in a `finally` block so it does not stay running when
 * page setup or scraping throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // With blank session cookies LinkedIn will not serve the feed, so say so
  // up front instead of silently producing an empty result.
  if (cookies.some(({ value }) => !value)) {
    console.warn('Warning: one or more LinkedIn cookies are empty; the scrape will probably return no posts.');
  }

  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled']
  });

  let uniquePosts;
  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    // Hide the automation flag before any page script runs.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    });

    await page.setCookie(...cookies);

    const allPosts = [];

    for (const url of urls) {
      try {
        const posts = await scrapePage(page, url);
        console.log(`Found ${posts.length} posts from this page`);
        allPosts.push(...posts);
      } catch (e) {
        console.log(`Error loading ${url}: ${e.message}`);
      }
    }

    // The same post can appear on both feeds; keep the first occurrence.
    uniquePosts = [...new Set(allPosts)];
  } finally {
    await browser.close();
  }

  console.log(`\n=== Total unique posts: ${uniquePosts.length} ===\n`);

  fs.writeFileSync('/home/esotericwarfare/opencode/all_posts_complete.json', JSON.stringify(uniquePosts, null, 2));
  console.log('Saved to all_posts_complete.json');

  console.log('\n--- All posts ---\n');
  uniquePosts.forEach((p, i) => {
    console.log(`=== POST ${i+1} ===`);
    console.log(p.substring(0, 600));
    console.log('\n---\n');
  });
}

// Run the scraper; report any unhandled failure and make the process exit
// with a non-zero status so shells and schedulers can detect it.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

`generate_html.py` genera el HTML a partir del JSON:

import json

# Load the scraped post texts (a JSON array of strings). The encoding is
# explicit so reading does not depend on the platform's default locale.
with open('/home/esotericwarfare/opencode/all_posts_complete.json', 'r', encoding='utf-8') as f:
    posts = json.load(f)

# Static opening of the generated page: document head, inline stylesheet and a
# hard-coded profile header. The string deliberately ends inside the open
# <div id="posts"> element; the code that follows appends to `html`.
html = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>George Hotz - LinkedIn Posts</title>
    <style>
        * { box-sizing: border-box; margin: 0; padding: 0; }
        body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #f3f2ef; color: #191919; padding: 20px; }
        .container { max-width: 700px; margin: 0 auto; }
        header { background: white; padding: 20px; border-radius: 8px; margin-bottom: 20px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
        header h1 { font-size: 24px; color: #0a66c2; margin-bottom: 5px; }
        header .title { font-size: 14px; color: #666; }
        header .followers { font-size: 12px; color: #666; margin-top: 8px; }
        .post { background: white; border-radius: 8px; padding: 16px; margin-bottom: 12px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
        .post-header { display: flex; align-items: center; margin-bottom: 12px; }
        .post-avatar { width: 48px; height: 48px; border-radius: 50%; background: linear-gradient(135deg, #667eea, #764ba2); display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 18px; margin-right: 12px; }
        .post-info { flex: 1; }
        .post-name { font-weight: 600; font-size: 14px; }
        .post-role { font-size: 12px; color: #666; }
        .post-time { font-size: 12px; color: #666; }
        .post-content { font-size: 14px; white-space: pre-wrap; word-wrap: break-word; }
        .post-stats { display: flex; gap: 15px; margin-top: 12px; padding-top: 12px; border-top: 1px solid #eee; font-size: 12px; color: #666; }
        .visibility { font-size: 12px; color: #666; margin-top: 8px; }
        .repost-label { background: #eef3f8; color: #0a66c2; padding: 12px; border-radius: 4px; margin-bottom: 12px; }
        .image-placeholder { background: #f3f2ef; border-radius: 8px; padding: 40px; text-align: center; color: #666; margin: 12px 0; }
        .footer { text-align: center; padding: 40px; color: #666; }
    </style>
</head>
<body>
    <div class="container">
        <header>
            <h1>George Hotz</h1>
            <div class="title">President at comma.ai (we're hiring, comma.ai/jobs)</div>
            <div class="title">San Diego, California, United States</div>
            <div class="followers">56,325 followers</div>
        </header>
        <div id="posts">'''

def _is_timestamp_line(line):
    """Return True when `line` has the whole word "ago" and a whole-word time unit."""
    import re  # function-scope import keeps this block self-contained

    has_ago = re.search(r'\bago\b', line) is not None
    has_unit = re.search(r'\b(?:second|minute|hour|day|week|month|year)s?\b', line) is not None
    return has_ago and has_unit


def parse_post(text):
    """Split the raw text of one scraped post into display fields.

    Args:
        text: Visible text of a single post element, lines separated by '\\n'.

    Returns:
        A dict with these keys:
          content    -- remaining body lines joined with '\\n' and stripped
          time       -- the last line recognised as a relative timestamp, or ''
          visibility -- the last line containing 'Visible to', or ''
          is_repost  -- True if 'reposted this' occurs anywhere in the text
          has_image  -- True if 'Activate to view larger image' occurs anywhere
          edited     -- True if 'Edited' occurs anywhere in the text
    """
    lines = text.split('\n')
    post = {'content': '', 'time': '', 'visibility': '', 'is_repost': 'reposted this' in text, 'has_image': 'Activate to view larger image' in text, 'edited': 'Edited' in text}

    content_lines = []
    for line in lines:
        # Match "ago" and the unit as whole words. A plain substring test
        # treated words such as "Chicago" as timestamps and silently dropped
        # those lines from the post body.
        if _is_timestamp_line(line):
            post['time'] = line.strip()
        elif 'Visible to' in line:
            post['visibility'] = line.strip()
        elif 'Feed post' not in line and 'Following' not in line and 'President at' not in line and line.strip() and not line.startswith('•'):
            # Lines made only of digits and dots are dropped rather than kept
            # as body text.
            if not line.replace('.','').isdigit():
                content_lines.append(line)

    post['content'] = '\n'.join(content_lines).strip()
    return post

# `html` already names the output string in this script, so import only the
# escape function under a distinct name rather than the `html` module itself.
from html import escape as escape_html

# Append one card per scraped post to the page being built.
for post_text in posts:
    p = parse_post(post_text)

    # Every value below originates from scraped page text, so each one is
    # escaped before it is placed inside markup. quote=False limits escaping to
    # &, < and >, the same characters the content was already escaped for.
    time_display = escape_html(p['time'] if p['time'] else 'some time ago', quote=False)
    edited_display = ' · Edited' if p['edited'] else ''
    visibility = escape_html(p['visibility'] if p['visibility'] else 'Visible to anyone on or off LinkedIn', quote=False)
    content = escape_html(p['content'], quote=False)

    html += f'''
        <div class="post">
            <div class="post-header">
                <div class="post-avatar">GH</div>
                <div class="post-info">
                    <div class="post-name">George Hotz</div>
                    <div class="post-role">President at comma.ai</div>
                    <div class="post-time">{time_display}{edited_display}</div>
                </div>
            </div>'''

    if p['is_repost']:
        html += '<div class="repost-label"><strong>George Hotz reposted this</strong></div>'

    html += f'<div class="post-content">{content}</div>'

    if p['has_image']:
        html += '<div class="image-placeholder">🖼️ Image</div>'

    html += f'''<div class="visibility">{visibility}</div>
        </div>'''

# Close the posts container and finish the document with a footer that
# reports how many posts were loaded.
total_posts = len(posts)
html += f'''</div>
        <div class="footer">
            <p>Scraped from LinkedIn using cookies from Falkon browser</p>
            <p>Total: {total_posts} posts</p>
        </div>
    </div>
</body>
</html>'''

# Write the page as UTF-8 to match its <meta charset="UTF-8"> declaration.
# Relying on the locale default can fail on the emoji or produce a file whose
# bytes disagree with the declared charset.
with open('/home/esotericwarfare/opencode/george_hotz_linkedin.html', 'w', encoding='utf-8') as f:
    f.write(html)

print(f'HTML created with {len(posts)} posts')

Linkedin scraper

April 04, 2026 Cli

linkedin.js te genera el json

generate_html.py genera el html a partir del JSON.

April 04, 2026
Cli

`linkedin.js` te genera el json

`generate_html.py` genera el html a partir del JSON.