Last data update: 2014.03.03

R: Facebook, Google, and LinkedIn IPO filings
ipoR Documentation

Facebook, Google, and LinkedIn IPO filings

Description

On Feb 1st, 2012, Facebook Inc. filed an S-1 form with the Securities and Exchange Commission as part of its initial public offering (IPO). This dataset includes the text of that document as well as the text of the IPO prospectuses of two competing companies: Google and LinkedIn.

Usage

data(ipo)

Format

The format is a list of three character vectors. Each vector contains the line-by-line text of the IPO Prospectus of Facebook, Google, and LinkedIn, respectively.

Details

Each of the three prospectuses is encoded in UTF-8 and contains some non-word characters left over from the layout of the original documents. For word-level analysis, it is recommended that the data first be processed with packages such as tm and stringr. See the example below.

Source

All IPO prospectuses are available from www.sec.gov: Facebook, Google, LinkedIn.

References

http://blogs.wsj.com/totalreturn/2012/02/06/mark-zuckerberg-ceo-for-life/

Credit to Qian Liu at the Wealthfront Blog for the data links and wordcloud example below.

Examples

data(ipo)
## Not run: 
# Word-frequency bar plot, word cloud and comparison cloud for the IPO
# prospectuses. Requires the 'tm' and 'wordcloud' packages.
# install.packages("tm")
# install.packages("wordcloud")
library(tm)
library(wordcloud)

# pre-process data
# `ipo` is a list of three character vectors; VCorpus keeps each (multi-line)
# vector as one document. NOTE(review): plain Corpus() may dispatch to
# SimpleCorpus in newer tm releases, which expects a character vector.
corp <- VCorpus(VectorSource(ipo), readerControl = list(language = "en"))
corp <- tm_map(corp, removePunctuation)
# Non-tm functions such as tolower() must be wrapped in content_transformer()
# so the documents keep their tm class; otherwise TermDocumentMatrix() fails.
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, removeWords, stopwords("en"))
f    <- corp[1] # facebook
g    <- corp[2] # google
l    <- corp[3] # linkedin

# term frequencies for the Facebook prospectus
tmat      <- TermDocumentMatrix(f)
m         <- as.matrix(tmat)
freq      <- rowSums(m)
words     <- rownames(m)
words.ord <- sort(freq, decreasing = TRUE)
# head() avoids NA bars when fewer than 15 distinct terms are present
barplot(head(words.ord, 15), las = 2)

# `colors` is the full argument name; `col` only worked via partial matching
wordcloud(words, freq, min.freq = 100, colors = "blue")

# contrast the vocabulary of the Facebook and Google prospectuses
tmat <- TermDocumentMatrix(c(f, g))
m    <- as.matrix(tmat)
comparison.cloud(m, max.words = 100)

## End(Not run)

Results


R version 3.3.1 (2016-06-21) -- "Bug in Your Hair"
Copyright (C) 2016 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> library(OIdata)
Loading required package: RCurl
Loading required package: bitops
Loading required package: maps

 # maps v3.1: updated 'world': all lakes moved to separate new #
 # 'lakes' database. Type '?world' or 'news(package="maps")'.  #


> png(filename="/home/ddbj/snapshot/RGM3/R_CC/result/OIdata/ipo.Rd_%03d_medium.png", width=480, height=480)
> ### Name: ipo
> ### Title: Facebook, Google, and LinkedIn IPO filings
> ### Aliases: ipo
> ### Keywords: datasets ipo corpus text mining
> 
> ### ** Examples
> 
> data(ipo)
> ## Not run: 
> ##D # install.packages("tm")
> ##D # install.packages("wordcloud")
> ##D library(tm)
> ##D library(wordcloud)
> ##D 
> ##D # pre-process data
> ##D corp <- Corpus(VectorSource(ipo), readerControl=list(language="en"))
> ##D corp <- tm_map(corp, removePunctuation)
> ##D corp <- tm_map(corp, tolower)
> ##D corp <- tm_map(corp, removeNumbers)
> ##D corp <- tm_map(corp, function(x)removeWords(x,stopwords()))
> ##D f    <- corp[1] # facebook
> ##D g    <- corp[2] # google
> ##D l    <- corp[3] # linkedin
> ##D 
> ##D tmat      <- TermDocumentMatrix(f)
> ##D m         <- as.matrix(tmat)
> ##D freq      <- rowSums(m)
> ##D words     <- rownames(m)
> ##D words.ord <- sort.int(freq, decreasing = T, index.return = F)
> ##D barplot(words.ord[1:15], las = 2)
> ##D 
> ##D wordcloud(words, freq, min.freq = 100, col='blue')
> ##D 
> ##D tmat <- TermDocumentMatrix(c(f, g))
> ##D m    <- as.matrix(tmat)
> ##D comparison.cloud(m, max.words = 100)
> ## End(Not run)
> 
> 
> 
> 
> 
> dev.off()
null device 
          1 
>