Skip to content

Commit 055c28a

Browse files
authored
add fasttext (alibaba#216)
1 parent 5d6b199 commit 055c28a

File tree

25 files changed

+401
-11
lines changed

25 files changed

+401
-11
lines changed

.github/workflows/pipeline.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
matrix:
1212
node_version: ['12']
1313
os: [ubuntu-latest]
14-
pipeline: ['mnist-image-classification', 'databinding-image-classification', 'text-bayes-classification']
14+
pipeline: ['mnist-image-classification', 'databinding-image-classification', 'text-bayes-classification', 'fasttext']
1515
steps:
1616
- uses: actions/checkout@v1
1717
- name: Using Node.js ${{ matrix.node_version }}

package-lock.json

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/costa/src/client.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ async function emitStart(message: PluginMessage): Promise<void> {
6262
const { params } = message;
6363
const pkg = params[0] as PluginPackage;
6464
const [ , ...pluginArgs ] = params;
65-
debug(`start loading plugin ${pkg.name}`);
65+
console.info(`start loading plugin ${pkg.name}`);
6666

6767
try {
6868
const boa = require('@pipcook/boa');
@@ -118,7 +118,7 @@ async function emitStart(message: PluginMessage): Promise<void> {
118118
if (resp) {
119119
const rid = uuid.v4();
120120
previousResults[rid] = resp;
121-
debug(`create a result "${rid}" for plugin "${pkg.name}@${pkg.version}"`);
121+
console.info(`create a result "${rid}" for plugin "${pkg.name}@${pkg.version}"`);
122122
recv(PluginOperator.WRITE, rid);
123123
} else {
124124
recv(PluginOperator.WRITE);

packages/plugins/data-access/csv-data-access/src/index.ts

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,6 @@ const csvDataAccess: DataAccessType = async (args: ArgsType): Promise<CsvDataset
4545
labelColumn
4646
} = args;
4747

48-
assert.ok(labelColumn, 'please specify the column name of your label');
49-
5048
const data: any = {
5149
dataStatistics: [],
5250
validationResult: {
@@ -58,13 +56,13 @@ const csvDataAccess: DataAccessType = async (args: ArgsType): Promise<CsvDataset
5856
};
5957

6058
const names: string[] = [];
61-
if (fs.existsSync(path.join(dataDir, 'train.csv'))) {
59+
if (fs.existsSync(path.join(dataDir, 'train.csv')) && labelColumn) {
6260
data.trainLoader = new DataLoader(path.join(dataDir, 'train.csv'), labelColumn);
6361
}
64-
if (fs.existsSync(path.join(dataDir, 'validation.csv'))) {
62+
if (fs.existsSync(path.join(dataDir, 'validation.csv')) && labelColumn) {
6563
data.validationLoader = new DataLoader(path.join(dataDir, 'validation.csv'), labelColumn);
6664
}
67-
if (fs.existsSync(path.join(dataDir, 'test.csv'))) {
65+
if (fs.existsSync(path.join(dataDir, 'test.csv')) && labelColumn) {
6866
data.testLoader = new DataLoader(path.join(dataDir, 'test.csv'), labelColumn);
6967
}
7068

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
node_modules
2+
.vscode/
3+
tsconfig.tsbuildinfo

packages/plugins/data-collect/fasttext-data-collect/package-lock.json

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"name": "@pipcook/plugins-fasttext-data-collect",
3+
"version": "0.5.9",
4+
"description": "",
5+
"main": "dist/index",
6+
"types": "dist/index",
7+
"files": [
8+
"dist"
9+
],
10+
"scripts": {
11+
"test": "echo \"Error: no test specified\" && exit 1",
12+
"build": "npm run clean && npm run compile",
13+
"clean": "rm -rf ./dist && rm -rf tsconfig.tsbuildinfo",
14+
"compile": "tsc -b tsconfig.json"
15+
},
16+
"author": "",
17+
"license": "ISC",
18+
"dependencies": {
19+
"@pipcook/pipcook-core": "^0.5.16",
20+
"byline": "^5.0.0",
21+
"tar-stream": "^2.1.2"
22+
},
23+
"devDependencies": {
24+
"@types/byline": "^4.2.32",
25+
"@types/jasmine": "^3.5.7",
26+
"@types/tar-stream": "^2.1.0",
27+
"nyc": "14.1.1"
28+
},
29+
"publishConfig": {
30+
"access": "public"
31+
},
32+
"pipcook": {
33+
"category": "dataCollect",
34+
"datatype": "text"
35+
}
36+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import { DataCollectType, ArgsType, download } from '@pipcook/pipcook-core';
2+
import { PassThrough } from 'stream';
3+
import tar from 'tar-stream';
4+
import byline from 'byline';
5+
import { createUnzip } from 'zlib';
6+
import { createReadStream, createWriteStream } from 'fs';
7+
8+
/**
 * Unpacks the gzipped tarball at `src` and splits the entry whose name ends
 * with `stackexchange.txt` into `${outputDir}/train.csv` and
 * `${outputDir}/test.csv`.
 *
 * Every line of that entry is decoded as UTF-8, each of the characters
 * `. ! ? , ' / ( )` is surrounded with spaces, and the result is lower-cased.
 * The first `trainNum` lines are written to `train.csv`; all remaining lines
 * are written to `test.csv`. All other tar entries are drained and ignored.
 *
 * @param src path of the `.tar.gz` archive to read.
 * @param outputDir directory that receives `train.csv` and `test.csv`.
 * @param trainNum number of leading lines that go to `train.csv`.
 * @returns a promise that resolves once the archive has been fully processed
 *   AND both CSV files have emitted `finish`; it rejects on any read, unzip,
 *   untar, line-splitting or write error.
 */
async function extract(src: string, outputDir: string, trainNum: number): Promise<void> {
  return new Promise((resolve, reject) => {
    // One promise per CSV writer, settled when that writer emits 'finish'.
    const flushed: Promise<void>[] = [];

    const extractor = tar.extract();
    extractor.on('finish', () => {
      // The extractor finishing only means the archive was consumed; wait for
      // the CSV writers as well so callers never observe half-written files.
      Promise.all(flushed).then(() => resolve(), reject);
    });
    extractor.on('error', reject);
    extractor.on('entry', (headers: tar.Headers, entry: PassThrough, next: () => void) => {
      if (headers.name.endsWith('stackexchange.txt')) {
        let linenum = 0;
        const trainCsv = createWriteStream(`${outputDir}/train.csv`);
        const testCsv = createWriteStream(`${outputDir}/test.csv`);
        for (const csv of [ trainCsv, testCsv ]) {
          // Surface write failures (e.g. missing output directory) to the caller.
          csv.on('error', reject);
          flushed.push(new Promise<void>((done) => {
            csv.on('finish', () => done());
          }));
        }

        const lineStream = byline.createStream(entry);
        lineStream.on('data', (line) => {
          const normalized = line.toString('utf8').replace(/([.\!?,'/()])/g, (x: string) => ` ${x} `).toLowerCase();
          if (linenum++ < trainNum) {
            trainCsv.write(normalized + '\n');
          } else {
            testCsv.write(normalized + '\n');
          }
        });
        lineStream.on('error', reject);
        // Close the writers when the line splitter is done rather than when the
        // raw tar entry ends: the splitter may still hold buffered lines at the
        // moment the entry itself reports 'end'.
        lineStream.on('end', () => {
          trainCsv.end();
          testCsv.end();
        });
      }
      // Always drain the entry and advance, otherwise the extractor stalls.
      entry.on('end', next);
      entry.resume();
    });

    const archive = createReadStream(src);
    const gunzip = createUnzip();
    // pipe() does not forward errors, so each stage needs its own handler.
    archive.on('error', reject);
    gunzip.on('error', reject);
    archive.pipe(gunzip).pipe(extractor);
  });
}
38+
39+
/**
 * Data-collect plugin entry point: downloads a gzipped tar archive and turns
 * it into `train.csv` / `test.csv` inside `args.dataDir`.
 *
 * Recognised `args` fields:
 * - `url`: archive location; when undefined, the cooking.stackexchange
 *   archive on dl.fbaipublicfiles.com is used.
 * - `trainNum`: how many leading lines go to the training file; 12404 when
 *   undefined.
 * - `dataDir`: directory that receives `dataset.tar.gz` and the CSV files.
 */
const fasttextDataCollect: DataCollectType = async (args: ArgsType): Promise<void> => {
  const { url = 'https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz' } = args;
  const { trainNum = 12404 } = args;
  const { dataDir } = args;

  // The archive is stored next to the files that are extracted from it.
  const archivePath = `${dataDir}/dataset.tar.gz`;

  await download(url, archivePath);
  await extract(archivePath, dataDir, trainNum);
};
50+
51+
export default fasttextDataCollect;
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"extends": "../../../../tsconfig.json",
3+
"compilerOptions": {
4+
"outDir": "./dist",
5+
"rootDir": "./src"
6+
},
7+
"exclude": [
8+
"node_modules",
9+
"dist"
10+
]
11+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
node_modules
2+
.vscode/
3+
tsconfig.tsbuildinfo

0 commit comments

Comments
 (0)