Source: site.view [edit]
Function name: scrapeRobsCategories
Arguments:
Description: Extract all the Perfect Cartoons on Rob's Google Doc pages
Page type: webl
Render function:  
Module: perfectCartoon

Page source:

// Accumulator for everything scraped in this run: one field per cartoon link
// (the key is the link after extractBitly), each holding an info object with
// imgUrl / cats / text / author fields (plus image size when available).
var cartoons = [. .];

// Index pages to scan; main() fetches each and hands it to processCategories.
var urls = ["https://docs.google.com/document/d/1MCr7ZIj03ae3Jq3D0CixGTfYPv7mpsFVUrxqVLG2iBQ/pub"];


// Fill in `info` from a page recognised as a harrybliss.com product page.
//
//   P    - fetched page to inspect
//   info - cartoon info object; updated in place and also returned
//
// Only the first <p> whose class is "product-image" is considered. When it
// holds exactly one <img>, the image's alt text becomes the cartoon text
// (and is fed to "generateCategories" for extra categories), its src becomes
// the image URL, and the size is recorded as 350x350. The author is set to
// "Harry Bliss" whenever such a paragraph exists, even without a usable image.
var getBlissAttributes = fun(P, info)

   every para in Elem(P, "p") do
      // A paragraph with no class attribute is simply treated as "no match".
      var cls = para.class ? nil;
      if cls == "product-image" then
         var images = Elem(P, "img") inside para;
         if Size(images) == 1 then
            var picture = images[0];
            var caption = Str_Trim(ExpandCharEntities(picture.alt));
            info.text := caption;
            // Merge generated tags into the existing categories, de-duplicated.
            var extra = WubCall("generateCategories", [caption]);
            if extra != [] then
               info.cats := ToList(ToSet(info.cats + extra))
            end;
            info.imgUrl := picture.src;
            info.imgHeight := 350;
            info.imgWidth := 350;
         end;

         info.author := "Harry Bliss";
         // Stop at the first product-image paragraph.
         return info
      end
   end;
   return info
end;
     
     
// Fill in `info` from a page recognised as a "Nast Collection" product page.
//
//   P    - fetched page to inspect
//   info - cartoon info object; updated in place and also returned
//
// Scans every <div>:
//   id "productHeading" - a single <h1> inside supplies the cartoon text, and
//                         a <span id="artistName"> inside supplies the author
//   id "productImage"   - a single <img> inside supplies the image URL and,
//                         when present, its height and width (nil otherwise)
// If a heading text was found it is stored and passed to "generateCategories"
// so the resulting tags can be merged into info.cats.
var getNastAttributes = fun(P, info)

   var heading = nil;
   every divEl in Elem(P, "div") do
      // Divs without an id attribute yield nil and match neither branch.
      var divId = divEl.id ? nil;
      if divId == "productHeading" then
         var titles = Elem(P, "h1") inside divEl;
         if Size(titles) == 1 then
            heading = Str_Trim(ExpandCharEntities(Text(titles[0])));
         end;

         every spanEl in (Elem(P, "span") inside divEl) do
            var spanId = spanEl.id ? nil;
            if spanId == "artistName" then
               info.author := Str_Trim(Text(spanEl))
            end
         end
      elsif divId == "productImage" then
         var images = Elem(P, "img") inside divEl;
         if Size(images) == 1 then
            var picture = images[0];
            info.imgUrl := picture.src;
            // Dimensions are optional attributes; fall back to nil.
            info.imgHeight := picture.height ? nil;
            info.imgWidth := picture.width ? nil;
         end;
      end
   end;

   // Nothing more to do when no heading text was found.
   if heading == nil then
      return info
   end;

   info.text := heading;
   var extra = WubCall("generateCategories", [heading]);
   if extra != [] then
      info.cats := ToList(ToSet(info.cats + extra))
   end;
   return info
end;

// Pull the bit.ly target out of a redirect-style link.
//
//   u - link href as found on the page
//
// The href is URL-decoded first. If it contains "q=http://bit.ly", the result
// is the text after "q=" up to (not including) the next "&", or to the end of
// the string when there is no "&". Otherwise the decoded href is returned
// unchanged.
var extractBitly = fun(u)
   var decoded = Url_Decode(u);
   var pos = Str_IndexOf("q=http://bit.ly", decoded);
   if pos < 0 then
      return decoded
   end;

   // Skip the two characters of "q=" so the result starts at "http://".
   var target = Select(decoded, pos + 2, Size(decoded));
   var amp = Str_IndexOf(`&`, target);
   if amp > 0 then
      target = Select(target, 0, amp)
   end;
   return target
end;
     

// Fetch a cartoon's product page and enrich `info` from it.
//
//   url  - address of the cartoon page
//   info - cartoon info object; returned (possibly updated in place)
//
// A failed fetch is tolerated: `info` comes back untouched. The page markup
// is searched for a site marker to choose the matching extractor; pages that
// match neither marker also leave `info` untouched.
var getExtendedAttributes = fun(url, info)

   var page = GetURL(url) ? nil;
   if page == nil then
      return info
   end;

   // Take the markup once; both site checks look at the same text.
   var html = Markup(page);
   if Str_IndexOf("Nast Collection", html) > 0 then
      return getNastAttributes(page, info)
   elsif Str_IndexOf("harrybliss.com", html) > 0 then
      return getBlissAttributes(page, info)
   end;
   return info
end;

// Scan one category page and record every cartoon it links to.
//
//   chapterCats - list of category names that apply to this page
//   url         - address of the category page
//
// For each <img> on the page, the first <a> that follows it is taken as the
// cartoon's link. The link (after extractBitly) is the key into the global
// `cartoons` object. The page's categories are merged, de-duplicated, into
// the entry's category list. Product-page details are fetched only for
// entries that do not have an image URL yet.
var processCategory = fun(chapterCats, url)
   var page = GetURL(url);
   every picture in Elem(page, "img") do
      var links = Elem(page, "a") after picture;
      // No following link, or a link without href, yields nil and is skipped.
      var href = links[0].href ? nil;
      if href != nil then
         var key = extractBitly(href);
         // Reuse the existing entry, or start a blank one for a new cartoon.
         var entry = cartoons[key] ? [. imgUrl = nil, cats = [], text = nil, author = nil .];
         entry.cats := ToList(ToSet(entry.cats + chapterCats));
         if entry.imgUrl != nil then
            // Already enriched on an earlier page; only the categories changed.
            cartoons[key] := entry
         else
            cartoons[key] := getExtendedAttributes(key, entry)
         end
      end
   end;
end;
  
  
// Walk an index page and process every category link on it.
//
//   P - fetched index page
//
// A link counts as a category link when its trimmed text contains "-Pub"
// after at least one other character. The text before "-Pub" is split on any
// of , & / - into individual category names, and the link target is handed to
// processCategory together with those names.
//
// Each name is trimmed and empty names are dropped before use. Splitting
// "Animals & Pets" otherwise yields "Animals " and " Pets", which do not
// de-duplicate against the same names without padding and later show up as
// repeated or empty categories in the output.
var processCategories = fun(P)
   every a in Elem(P, "a") do
      var s = Str_Trim(Text(a));

      var i = Str_IndexOf("-Pub", s);
      if (i > 0) then
         var cats = [];
         every raw in Str_Split(ExpandCharEntities(Select(s, 0, i)), ",&/-") do
            var name = Str_Trim(raw);
            if name != "" then
               cats = cats + [name]
            end
         end;
         processCategory(cats, a.href);
      end
   end
end;

// Clean a value for use as one tab-separated output column.
//
//   s - any value, or nil
//
// nil becomes the empty string. Anything else is converted to a string, has
// its tab characters replaced by spaces (so it cannot break the column
// layout), and is trimmed.
var c = fun(s)
   if s != nil then
      return Str_Trim(Wub_ReplaceAll(ToString(s), "\t", " "))
   end;
   return ""
end;
     
// Render a list of categories as a single ", "-separated string.
//
//   lst - list of category values
//
// Each entry is converted to a string and trimmed before joining; the joined
// result is trimmed once more. An empty list gives the empty string.
var cats = fun(lst)
   var joined = "";
   var first = true;
   every item in lst do
      // Put the separator before every entry except the first.
      if first then
         first = false
      else
         joined = joined + ", "
      end;
      joined = joined + Str_Trim(ToString(item))
   end;
   return Str_Trim(joined)
end;
     
// Entry point of the scrape: fetch every index page listed in `urls` and
// process the category links found on it.
var main = fun()
   every indexUrl in urls do
      processCategories(GetURL(indexUrl))
   end
end;
  
// Run the scrape; this fills the global `cartoons` object.
main();

// Replace the stored "cartooninfo" data with one tab-separated line per
// cartoon: link, image URL, text, text, author, categories, "2.0".
// NOTE(review): the cartoon text is written to two adjacent columns - confirm
// that the consumer of "cartooninfo" expects that layout.
Wub_DeleteData("cartooninfo");
every key in cartoons do
   var entry = cartoons[key];
   var caption = c(entry.text);
   var row = c(key) + "\t" + c(entry.imgUrl) + "\t" + caption
     + "\t" + caption + "\t" + c(entry.author)
     + "\t" + cats(entry.cats) + "\t2.0\n";
   Wub_AppendData("cartooninfo", row);
end;
// Finish by reading the data set back.
Wub_ReadData("cartooninfo");