
Add scraping and gitlab scripts (#206)

Co-authored-by: Yethal <nosuchemail@email.com>
Yethal 2022-04-14 20:56:09 +02:00 committed by GitHub
parent 34c241172b
commit 4a1ccf0e43
4 changed files with 76 additions and 0 deletions

gitlab/README.md Normal file

@@ -0,0 +1,13 @@
# GitLab Scanner
### Definition
I use this script to scan the contents of my company's GitLab server. Thanks to nushell's multithreading, it can scan around 1k repositories in about 9 seconds.
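The parallelism comes from `par-each`, which runs a closure over each input item on a thread pool. A minimal sketch of the idea, independent of any GitLab specifics:

```nu
# run the closure for every number in parallel and collect the results into one list
seq 1 10 | par-each {|n| $"processed item ($n)" }
```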
### Setup
1. Generate a GitLab Personal Access Token and save it in the `GITLAB_TOKEN` environment variable
2. Run the script, providing the necessary data as arguments (or hardcode them in the script if you don't expect them to change often); see the example below
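A hypothetical invocation (the URL, token, file, phrase, and branch values are placeholders):

```nu
# set the token for the current session, then run the scan
let-env GITLAB_TOKEN = 'glpat-xxxxxxxxxxxxxxxxxxxx'
./gitlab.nu --base_url 'https://gitlab.example.com' --file '.gitlab-ci.yml' --phrase 'docker' --branch 'main'
```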
### Possible future improvements
* Multiple files/phrases/branches to search
* Maybe create some stats like how many times a given phrase was found in a repo or file
* Maybe offer an option to replace a phrase and automatically push the updated file or create a merge request

gitlab/gitlab.nu Executable file

@@ -0,0 +1,38 @@
#!/usr/bin/env nu
let page_size = 100    # GitLab returns at most 100 items per page

# Call the /projects API of the given GitLab instance, authenticating
# with the token stored in the GITLAB_TOKEN environment variable
def call-gitlab [
    base_url: string    # base url of the GitLab instance
    ...args: string     # path segments appended to the /projects endpoint
    --query: string     # query string parameters
] {
    fetch -H [Authorization $"Bearer ($env.GITLAB_TOKEN)"] $"($base_url)/api/v4/projects/($args | str collect)?($query)"
}

# Search files on your GitLab server
def main [
    --base_url: string # base url of your GitLab instance
    --file: string # file (or path to file if in a subfolder) you want to scan
    --phrase: string # phrase you want to search for
    --branch: string # branch to scan
] {
    # The /projects endpoint returns at most $page_size items per call, so we need
    # multiple calls to retrieve the full list. Ordering by id (descending) makes
    # the first item's id the highest one; dividing it by $page_size over-estimates
    # the page count when ids have gaps, which is harmless. math ceil keeps the
    # last, partially filled page from being dropped.
    let num_pages = ((call-gitlab $base_url --query 'page=1&per_page=1&order_by=id&simple=true' | get id.0 | into int) / $page_size | math ceil)
    seq 1 $num_pages
    | par-each {|page|
        call-gitlab $base_url --query $"page=($page)&per_page=($page_size)" | select name id
    }
    | flatten
    | par-each {|repo|
        # note: $file must be url-encoded if it points into a subfolder (%2F instead of /)
        let payload = (call-gitlab $base_url ($repo.id | into string) '/repository/files/' $file --query $"ref=($branch)")
        # when the file is missing, the API returns an error message instead of content
        if ($payload | columns | find message | empty?) {
            let matches = ($payload | get content | hash base64 --decode | lines | find $phrase)
            if ($matches | length) > 0 {
                echo $"($file) in ($repo.name) repo contains ($phrase) phrase"
            }
        }
    }
}

webscraping/README.md Normal file

@@ -0,0 +1,5 @@
# Web Scraping
### Definition
Simple scripts demonstrating how to scrape websites in nushell. Requires the `query web` plugin.
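A minimal sketch of the plugin in action (the URL and selector are illustrative only, not taken from the script below):

```nu
# fetch a page and extract the text of every <h2> heading
fetch https://example.com | query web -q 'h2'
```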

webscraping/nuschiit.nu Executable file

@@ -0,0 +1,20 @@
#!/usr/bin/env nu
let baseurl = 'https://www.schiit.co.uk/'
# category pages to scan for products
let pages = ['headphone-amps' 'dacs' 'schiit-gaming-products' 'power-amplifiers' 'preamps' 'upgrades' 'accessories-cables' 'schiit%20graded%20stock']
# Simple script to check stock of the https://schiit.co.uk store
def main [] {
    $pages
    | par-each {|page|
        fetch $"($baseurl)($page)"
        | query web -q 'div.caption' -m    # grab every product card on the page
        | par-each {|item|
            $item
            | query web -q 'p.stock, h5'    # product name (h5) and stock status (p.stock)
            | rotate --ccw name availability
        }
        | flatten
    }
    | flatten
    | uniq    # drop duplicate rows
    | sort-by availability
}
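A possible way to reuse the script interactively; the stock wording in the filter is a guess, since the site's exact strings are not shown here:

```nu
# load the script's definitions, then keep only items that look in stock
source nuschiit.nu
main | where availability =~ 'In Stock'
```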